linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
@ 2018-06-27 10:34 Keiichi Watanabe
  2018-06-27 17:21 ` Kieran Bingham
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Keiichi Watanabe @ 2018-06-27 10:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Laurent Pinchart, Mauro Carvalho Chehab, linux-media,
	kieran.bingham, tfiga, dianders, keiichiw

On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
uncached memory allocation methods. In such situations, it sometimes
takes a long time to access URB buffers.  This can be a cause of video
flickering problems if a resolution is high and a USB controller has
a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
header data from (uncached) URB buffer into (cached) local buffer.

This change should make the elapsed time of the interrupt handler
shorter on platforms with non-coherent DMA. We measured the elapsed
time of each callback of uvc_video_complete without/with this patch
while capturing Full HD video in
https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
I tested it on the top of Kieran Bingham's Asynchronous UVC series
https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
I collected data for 5 seconds. (There were around 480 callbacks in
this case.) The following result shows that this patch makes
uvc_video_complete about 2x faster.

           | average | median  | min     | max     | standard deviation
w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns

In addition, we confirmed that this patch doesn't make it worse on
coherent DMA architecture by performing the same measurements on a
Broadwell Chromebox with the same camera.

           | average | median  | min     | max     | standard deviation
w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns

Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
---

After applying 6 patches in
https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
I measured elapsed time by adding the following code to
/drivers/media/usb/uvc/uvc_video.c

@@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
        struct uvc_video_queue *queue = &stream->queue;
 	struct uvc_buffer *buf = NULL;
 	int ret;
+	ktime_t start, end;
+	int elapsed_time;
+	start = ktime_get();
 	switch (urb->status) {
 	case 0:

@@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)

 	INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
 	queue_work(stream->async_wq, &uvc_urb->work);
+
+	end = ktime_get();
+	elapsed_time = ktime_to_ns(ktime_sub(end, start));
+	pr_err("elapsed time: %d ns", elapsed_time);
 }

 /*


 drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
 1 file changed, 43 insertions(+), 49 deletions(-)
diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index a88b2e51a666..ff2eddc55530 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)

 static void
 uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer *buf,
-		       const u8 *data, int len)
+		       const u8 *data, int len, unsigned int header_size,
+		       bool has_pts, bool has_scr)
 {
 	struct uvc_clock_sample *sample;
-	unsigned int header_size;
-	bool has_pts = false;
-	bool has_scr = false;
 	unsigned long flags;
 	ktime_t time;
 	u16 host_sof;
 	u16 dev_sof;

-	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
-	case UVC_STREAM_PTS | UVC_STREAM_SCR:
-		header_size = 12;
-		has_pts = true;
-		has_scr = true;
-		break;
-	case UVC_STREAM_PTS:
-		header_size = 6;
-		has_pts = true;
-		break;
-	case UVC_STREAM_SCR:
-		header_size = 8;
-		has_scr = true;
-		break;
-	default:
-		header_size = 2;
-		break;
-	}
-
 	/* Check for invalid headers. */
 	if (len < header_size)
 		return;
@@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming *stream,
  */

 static void uvc_video_stats_decode(struct uvc_streaming *stream,
-		const u8 *data, int len)
+				   const u8 *data, int len,
+				   unsigned int header_size, bool has_pts,
+				   bool has_scr)
 {
-	unsigned int header_size;
-	bool has_pts = false;
-	bool has_scr = false;
 	u16 uninitialized_var(scr_sof);
 	u32 uninitialized_var(scr_stc);
 	u32 uninitialized_var(pts);
@@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
 	    stream->stats.frame.nb_packets == 0)
 		stream->stats.stream.start_ts = ktime_get();

-	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
-	case UVC_STREAM_PTS | UVC_STREAM_SCR:
-		header_size = 12;
-		has_pts = true;
-		has_scr = true;
-		break;
-	case UVC_STREAM_PTS:
-		header_size = 6;
-		has_pts = true;
-		break;
-	case UVC_STREAM_SCR:
-		header_size = 8;
-		has_scr = true;
-		break;
-	default:
-		header_size = 2;
-		break;
-	}
-
 	/* Check for invalid headers. */
 	if (len < header_size || data[0] < header_size) {
 		stream->stats.frame.nb_invalid++;
@@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
  * to be called with a NULL buf parameter. uvc_video_decode_data and
  * uvc_video_decode_end will never be called with a NULL buffer.
  */
+static void uvc_video_decode_header_size(const u8 *data, int *header_size,
+					 bool *has_pts, bool *has_scr)
+{
+	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
+	case UVC_STREAM_PTS | UVC_STREAM_SCR:
+		*header_size = 12;
+		*has_pts = true;
+		*has_scr = true;
+		break;
+	case UVC_STREAM_PTS:
+		*header_size = 6;
+		*has_pts = true;
+		break;
+	case UVC_STREAM_SCR:
+		*header_size = 8;
+		*has_scr = true;
+		break;
+	default:
+		*header_size = 2;
+	}
+}
+
 static int uvc_video_decode_start(struct uvc_streaming *stream,
-		struct uvc_buffer *buf, const u8 *data, int len)
+				  struct uvc_buffer *buf, const u8 *urb_data,
+				  int len)
 {
 	u8 fid;
+	u8 data[12];
+	unsigned int header_size;
+	bool has_pts = false, has_scr = false;
+
+	/* Cache the header since urb_data is uncached memory. The size of
+	 * header is at most 12 bytes.
+	 */
+	memcpy(data, urb_data, min(len, 12));

 	/* Sanity checks:
 	 * - packet must be at least 2 bytes long
@@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
 			uvc_video_stats_update(stream);
 	}

-	uvc_video_clock_decode(stream, buf, data, len);
-	uvc_video_stats_decode(stream, data, len);
+	uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
+
+	uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
+			       has_scr);
+	uvc_video_stats_decode(stream, data, len, header_size, has_pts,
+			       has_scr);

 	/* Store the payload FID bit and return immediately when the buffer is
 	 * NULL.
--
2.18.0.rc2.346.g013aa6912e-goog

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-06-27 10:34 [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing Keiichi Watanabe
@ 2018-06-27 17:21 ` Kieran Bingham
  2018-07-24  4:18   ` Keiichi Watanabe
  2018-07-24 11:03 ` Kieran Bingham
  2018-07-30 16:00 ` Laurent Pinchart
  2 siblings, 1 reply; 23+ messages in thread
From: Kieran Bingham @ 2018-06-27 17:21 UTC (permalink / raw)
  To: Keiichi Watanabe, linux-kernel
  Cc: Laurent Pinchart, Mauro Carvalho Chehab, linux-media, tfiga, dianders

Hi Keiichi,

I just wanted to reply here quickly to say this is a really interesting
development.

I would like to review and test it properly; however I am now on holiday
until at least the 9th July and so I won't be able to look until after
that date.

I didn't want you to be left hanging though :-)

Ping me if you haven't heard anything by the 20th or so. Of course if
anyone else gets round to testing and reviewing then that's great too.

Regards
--
Kieran



On 27/06/18 11:34, Keiichi Watanabe wrote:
> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> uncached memory allocation methods. In such situations, it sometimes
> takes a long time to access URB buffers.  This can be a cause of video
> flickering problems if a resolution is high and a USB controller has
> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> header data from (uncached) URB buffer into (cached) local buffer.
> 
> This change should make the elapsed time of the interrupt handler
> shorter on platforms with non-coherent DMA. We measured the elapsed
> time of each callback of uvc_video_complete without/with this patch
> while capturing Full HD video in
> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> I tested it on the top of Kieran Bingham's Asynchronous UVC series
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> I collected data for 5 seconds. (There were around 480 callbacks in
> this case.) The following result shows that this patch makes
> uvc_video_complete about 2x faster.
> 
>            | average | median  | min     | max     | standard deviation
> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> 
> In addition, we confirmed that this patch doesn't make it worse on
> coherent DMA architecture by performing the same measurements on a
> Broadwell Chromebox with the same camera.
> 
>            | average | median  | min     | max     | standard deviation
> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
> 
> Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
> ---
> 
> After applying 6 patches in
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
> I measured elapsed time by adding the following code to
> /drivers/media/usb/uvc/uvc_video.c
> 
> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>         struct uvc_video_queue *queue = &stream->queue;
>  	struct uvc_buffer *buf = NULL;
>  	int ret;
> +	ktime_t start, end;
> +	int elapsed_time;
> +	start = ktime_get();
>  	switch (urb->status) {
>  	case 0:
> 
> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
> 
>  	INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>  	queue_work(stream->async_wq, &uvc_urb->work);
> +
> +	end = ktime_get();
> +	elapsed_time = ktime_to_ns(ktime_sub(end, start));
> +	pr_err("elapsed time: %d ns", elapsed_time);
>  }
> 
>  /*
> 
> 
>  drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>  1 file changed, 43 insertions(+), 49 deletions(-)
> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
> index a88b2e51a666..ff2eddc55530 100644
> --- a/drivers/media/usb/uvc/uvc_video.c
> +++ b/drivers/media/usb/uvc/uvc_video.c
> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
> 
>  static void
>  uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer *buf,
> -		       const u8 *data, int len)
> +		       const u8 *data, int len, unsigned int header_size,
> +		       bool has_pts, bool has_scr)
>  {
>  	struct uvc_clock_sample *sample;
> -	unsigned int header_size;
> -	bool has_pts = false;
> -	bool has_scr = false;
>  	unsigned long flags;
>  	ktime_t time;
>  	u16 host_sof;
>  	u16 dev_sof;
> 
> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> -		header_size = 12;
> -		has_pts = true;
> -		has_scr = true;
> -		break;
> -	case UVC_STREAM_PTS:
> -		header_size = 6;
> -		has_pts = true;
> -		break;
> -	case UVC_STREAM_SCR:
> -		header_size = 8;
> -		has_scr = true;
> -		break;
> -	default:
> -		header_size = 2;
> -		break;
> -	}
> -
>  	/* Check for invalid headers. */
>  	if (len < header_size)
>  		return;
> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming *stream,
>   */
> 
>  static void uvc_video_stats_decode(struct uvc_streaming *stream,
> -		const u8 *data, int len)
> +				   const u8 *data, int len,
> +				   unsigned int header_size, bool has_pts,
> +				   bool has_scr)
>  {
> -	unsigned int header_size;
> -	bool has_pts = false;
> -	bool has_scr = false;
>  	u16 uninitialized_var(scr_sof);
>  	u32 uninitialized_var(scr_stc);
>  	u32 uninitialized_var(pts);
> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
>  	    stream->stats.frame.nb_packets == 0)
>  		stream->stats.stream.start_ts = ktime_get();
> 
> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> -		header_size = 12;
> -		has_pts = true;
> -		has_scr = true;
> -		break;
> -	case UVC_STREAM_PTS:
> -		header_size = 6;
> -		has_pts = true;
> -		break;
> -	case UVC_STREAM_SCR:
> -		header_size = 8;
> -		has_scr = true;
> -		break;
> -	default:
> -		header_size = 2;
> -		break;
> -	}
> -
>  	/* Check for invalid headers. */
>  	if (len < header_size || data[0] < header_size) {
>  		stream->stats.frame.nb_invalid++;
> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
>   * to be called with a NULL buf parameter. uvc_video_decode_data and
>   * uvc_video_decode_end will never be called with a NULL buffer.
>   */
> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
> +					 bool *has_pts, bool *has_scr)
> +{
> +	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> +	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> +		*header_size = 12;
> +		*has_pts = true;
> +		*has_scr = true;
> +		break;
> +	case UVC_STREAM_PTS:
> +		*header_size = 6;
> +		*has_pts = true;
> +		break;
> +	case UVC_STREAM_SCR:
> +		*header_size = 8;
> +		*has_scr = true;
> +		break;
> +	default:
> +		*header_size = 2;
> +	}
> +}
> +
>  static int uvc_video_decode_start(struct uvc_streaming *stream,
> -		struct uvc_buffer *buf, const u8 *data, int len)
> +				  struct uvc_buffer *buf, const u8 *urb_data,
> +				  int len)
>  {
>  	u8 fid;
> +	u8 data[12];
> +	unsigned int header_size;
> +	bool has_pts = false, has_scr = false;
> +
> +	/* Cache the header since urb_data is uncached memory. The size of
> +	 * header is at most 12 bytes.
> +	 */
> +	memcpy(data, urb_data, min(len, 12));
> 
>  	/* Sanity checks:
>  	 * - packet must be at least 2 bytes long
> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
>  			uvc_video_stats_update(stream);
>  	}
> 
> -	uvc_video_clock_decode(stream, buf, data, len);
> -	uvc_video_stats_decode(stream, data, len);
> +	uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
> +
> +	uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
> +			       has_scr);
> +	uvc_video_stats_decode(stream, data, len, header_size, has_pts,
> +			       has_scr);
> 
>  	/* Store the payload FID bit and return immediately when the buffer is
>  	 * NULL.
> --
> 2.18.0.rc2.346.g013aa6912e-goog
> 

-- 
Regards
--
Kieran

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-06-27 17:21 ` Kieran Bingham
@ 2018-07-24  4:18   ` Keiichi Watanabe
  2018-07-24 11:07     ` Kieran Bingham
  0 siblings, 1 reply; 23+ messages in thread
From: Keiichi Watanabe @ 2018-07-24  4:18 UTC (permalink / raw)
  To: kieran.bingham
  Cc: Linux Kernel Mailing List, Laurent Pinchart,
	Mauro Carvalho Chehab, Linux Media Mailing List, Tomasz Figa,
	Douglas Anderson

Hi Kieran,

How is it going?
I would appreciate if you could review this patch.

Best regards,
Keiichi

On Thu, Jun 28, 2018 at 2:21 AM, Kieran Bingham
<kieran.bingham@ideasonboard.com> wrote:
> Hi Keiichi,
>
> I just wanted to reply here quickly to say this is a really interesting
> development.
>
> I would like to review and test it properly; however I am now on holiday
> until at least the 9th July and so I won't be able to look until after
> that date.
>
> I didn't want you to be left hanging though :-)
>
> Ping me if you haven't heard anything by the 20th or so. Of course if
> anyone else gets round to testing and reviewing then that's great too.
>
> Regards
> --
> Kieran
>
>
>
> On 27/06/18 11:34, Keiichi Watanabe wrote:
>> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
>> uncached memory allocation methods. In such situations, it sometimes
>> takes a long time to access URB buffers.  This can be a cause of video
>> flickering problems if a resolution is high and a USB controller has
>> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
>> header data from (uncached) URB buffer into (cached) local buffer.
>>
>> This change should make the elapsed time of the interrupt handler
>> shorter on platforms with non-coherent DMA. We measured the elapsed
>> time of each callback of uvc_video_complete without/with this patch
>> while capturing Full HD video in
>> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
>> I tested it on the top of Kieran Bingham's Asynchronous UVC series
>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
>> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
>> I collected data for 5 seconds. (There were around 480 callbacks in
>> this case.) The following result shows that this patch makes
>> uvc_video_complete about 2x faster.
>>
>>            | average | median  | min     | max     | standard deviation
>> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
>> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
>>
>> In addition, we confirmed that this patch doesn't make it worse on
>> coherent DMA architecture by performing the same measurements on a
>> Broadwell Chromebox with the same camera.
>>
>>            | average | median  | min     | max     | standard deviation
>> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
>> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
>>
>> Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
>> ---
>>
>> After applying 6 patches in
>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
>> I measured elapsed time by adding the following code to
>> /drivers/media/usb/uvc/uvc_video.c
>>
>> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>>         struct uvc_video_queue *queue = &stream->queue;
>>       struct uvc_buffer *buf = NULL;
>>       int ret;
>> +     ktime_t start, end;
>> +     int elapsed_time;
>> +     start = ktime_get();
>>       switch (urb->status) {
>>       case 0:
>>
>> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
>>
>>       INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>>       queue_work(stream->async_wq, &uvc_urb->work);
>> +
>> +     end = ktime_get();
>> +     elapsed_time = ktime_to_ns(ktime_sub(end, start));
>> +     pr_err("elapsed time: %d ns", elapsed_time);
>>  }
>>
>>  /*
>>
>>
>>  drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>>  1 file changed, 43 insertions(+), 49 deletions(-)
>> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
>> index a88b2e51a666..ff2eddc55530 100644
>> --- a/drivers/media/usb/uvc/uvc_video.c
>> +++ b/drivers/media/usb/uvc/uvc_video.c
>> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
>>
>>  static void
>>  uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer *buf,
>> -                    const u8 *data, int len)
>> +                    const u8 *data, int len, unsigned int header_size,
>> +                    bool has_pts, bool has_scr)
>>  {
>>       struct uvc_clock_sample *sample;
>> -     unsigned int header_size;
>> -     bool has_pts = false;
>> -     bool has_scr = false;
>>       unsigned long flags;
>>       ktime_t time;
>>       u16 host_sof;
>>       u16 dev_sof;
>>
>> -     switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> -     case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> -             header_size = 12;
>> -             has_pts = true;
>> -             has_scr = true;
>> -             break;
>> -     case UVC_STREAM_PTS:
>> -             header_size = 6;
>> -             has_pts = true;
>> -             break;
>> -     case UVC_STREAM_SCR:
>> -             header_size = 8;
>> -             has_scr = true;
>> -             break;
>> -     default:
>> -             header_size = 2;
>> -             break;
>> -     }
>> -
>>       /* Check for invalid headers. */
>>       if (len < header_size)
>>               return;
>> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming *stream,
>>   */
>>
>>  static void uvc_video_stats_decode(struct uvc_streaming *stream,
>> -             const u8 *data, int len)
>> +                                const u8 *data, int len,
>> +                                unsigned int header_size, bool has_pts,
>> +                                bool has_scr)
>>  {
>> -     unsigned int header_size;
>> -     bool has_pts = false;
>> -     bool has_scr = false;
>>       u16 uninitialized_var(scr_sof);
>>       u32 uninitialized_var(scr_stc);
>>       u32 uninitialized_var(pts);
>> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
>>           stream->stats.frame.nb_packets == 0)
>>               stream->stats.stream.start_ts = ktime_get();
>>
>> -     switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> -     case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> -             header_size = 12;
>> -             has_pts = true;
>> -             has_scr = true;
>> -             break;
>> -     case UVC_STREAM_PTS:
>> -             header_size = 6;
>> -             has_pts = true;
>> -             break;
>> -     case UVC_STREAM_SCR:
>> -             header_size = 8;
>> -             has_scr = true;
>> -             break;
>> -     default:
>> -             header_size = 2;
>> -             break;
>> -     }
>> -
>>       /* Check for invalid headers. */
>>       if (len < header_size || data[0] < header_size) {
>>               stream->stats.frame.nb_invalid++;
>> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
>>   * to be called with a NULL buf parameter. uvc_video_decode_data and
>>   * uvc_video_decode_end will never be called with a NULL buffer.
>>   */
>> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
>> +                                      bool *has_pts, bool *has_scr)
>> +{
>> +     switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> +     case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> +             *header_size = 12;
>> +             *has_pts = true;
>> +             *has_scr = true;
>> +             break;
>> +     case UVC_STREAM_PTS:
>> +             *header_size = 6;
>> +             *has_pts = true;
>> +             break;
>> +     case UVC_STREAM_SCR:
>> +             *header_size = 8;
>> +             *has_scr = true;
>> +             break;
>> +     default:
>> +             *header_size = 2;
>> +     }
>> +}
>> +
>>  static int uvc_video_decode_start(struct uvc_streaming *stream,
>> -             struct uvc_buffer *buf, const u8 *data, int len)
>> +                               struct uvc_buffer *buf, const u8 *urb_data,
>> +                               int len)
>>  {
>>       u8 fid;
>> +     u8 data[12];
>> +     unsigned int header_size;
>> +     bool has_pts = false, has_scr = false;
>> +
>> +     /* Cache the header since urb_data is uncached memory. The size of
>> +      * header is at most 12 bytes.
>> +      */
>> +     memcpy(data, urb_data, min(len, 12));
>>
>>       /* Sanity checks:
>>        * - packet must be at least 2 bytes long
>> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
>>                       uvc_video_stats_update(stream);
>>       }
>>
>> -     uvc_video_clock_decode(stream, buf, data, len);
>> -     uvc_video_stats_decode(stream, data, len);
>> +     uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
>> +
>> +     uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
>> +                            has_scr);
>> +     uvc_video_stats_decode(stream, data, len, header_size, has_pts,
>> +                            has_scr);
>>
>>       /* Store the payload FID bit and return immediately when the buffer is
>>        * NULL.
>> --
>> 2.18.0.rc2.346.g013aa6912e-goog
>>
>
> --
> Regards
> --
> Kieran

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-06-27 10:34 [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing Keiichi Watanabe
  2018-06-27 17:21 ` Kieran Bingham
@ 2018-07-24 11:03 ` Kieran Bingham
  2018-07-30 16:00 ` Laurent Pinchart
  2 siblings, 0 replies; 23+ messages in thread
From: Kieran Bingham @ 2018-07-24 11:03 UTC (permalink / raw)
  To: Keiichi Watanabe, linux-kernel, Laurent Pinchart
  Cc: Mauro Carvalho Chehab, linux-media, tfiga, dianders

Hi Keiichi

On 27/06/18 11:34, Keiichi Watanabe wrote:
> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> uncached memory allocation methods. In such situations, it sometimes
> takes a long time to access URB buffers.  This can be a cause of video
> flickering problems if a resolution is high and a USB controller has
> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> header data from (uncached) URB buffer into (cached) local buffer.
> 
> This change should make the elapsed time of the interrupt handler
> shorter on platforms with non-coherent DMA. We measured the elapsed
> time of each callback of uvc_video_complete without/with this patch
> while capturing Full HD video in
> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> I tested it on the top of Kieran Bingham's Asynchronous UVC series
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.

I've nudged Laurent to get my Async patches reviewed - but I don't think
it's going to make it into v4.19 at the moment :(


> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> I collected data for 5 seconds. (There were around 480 callbacks in
> this case.) The following result shows that this patch makes
> uvc_video_complete about 2x faster.
>            | average | median  | min     | max     | standard deviation
> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> 
> In addition, we confirmed that this patch doesn't make it worse on
> coherent DMA architecture by performing the same measurements on a
> Broadwell Chromebox with the same camera.
> 
>            | average | median  | min     | max     | standard deviation
> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns

Uhm .... what happened there though. The Max has increased?

This is technically still faster than the Chromebook, - but slower than
the "w/o caching" line of the same test. Will this have any impact on
any other use-cases?


> Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
> ---
> 
> After applying 6 patches in
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
> I measured elapsed time by adding the following code to
> /drivers/media/usb/uvc/uvc_video.c
> 
> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>         struct uvc_video_queue *queue = &stream->queue;
>  	struct uvc_buffer *buf = NULL;
>  	int ret;
> +	ktime_t start, end;
> +	int elapsed_time;
> +	start = ktime_get();
>  	switch (urb->status) {
>  	case 0:
> 
> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
> 
>  	INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>  	queue_work(stream->async_wq, &uvc_urb->work);
> +
> +	end = ktime_get();
> +	elapsed_time = ktime_to_ns(ktime_sub(end, start));
> +	pr_err("elapsed time: %d ns", elapsed_time);
>  }
> 
>  /*
> 
> 
>  drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>  1 file changed, 43 insertions(+), 49 deletions(-)
> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
> index a88b2e51a666..ff2eddc55530 100644
> --- a/drivers/media/usb/uvc/uvc_video.c
> +++ b/drivers/media/usb/uvc/uvc_video.c
> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
> 
>  static void
>  uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer *buf,
> -		       const u8 *data, int len)
> +		       const u8 *data, int len, unsigned int header_size,
> +		       bool has_pts, bool has_scr)
>  {
>  	struct uvc_clock_sample *sample;
> -	unsigned int header_size;
> -	bool has_pts = false;
> -	bool has_scr = false;
>  	unsigned long flags;
>  	ktime_t time;
>  	u16 host_sof;
>  	u16 dev_sof;
> 
> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> -		header_size = 12;
> -		has_pts = true;
> -		has_scr = true;
> -		break;
> -	case UVC_STREAM_PTS:
> -		header_size = 6;
> -		has_pts = true;
> -		break;
> -	case UVC_STREAM_SCR:
> -		header_size = 8;
> -		has_scr = true;
> -		break;
> -	default:
> -		header_size = 2;
> -		break;
> -	}
> -
>  	/* Check for invalid headers. */
>  	if (len < header_size)
>  		return;
> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming *stream,
>   */
> 
>  static void uvc_video_stats_decode(struct uvc_streaming *stream,
> -		const u8 *data, int len)
> +				   const u8 *data, int len,
> +				   unsigned int header_size, bool has_pts,
> +				   bool has_scr)
>  {
> -	unsigned int header_size;
> -	bool has_pts = false;
> -	bool has_scr = false;
>  	u16 uninitialized_var(scr_sof);
>  	u32 uninitialized_var(scr_stc);
>  	u32 uninitialized_var(pts);
> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
>  	    stream->stats.frame.nb_packets == 0)
>  		stream->stats.stream.start_ts = ktime_get();
> 
> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> -		header_size = 12;
> -		has_pts = true;
> -		has_scr = true;
> -		break;
> -	case UVC_STREAM_PTS:
> -		header_size = 6;
> -		has_pts = true;
> -		break;
> -	case UVC_STREAM_SCR:
> -		header_size = 8;
> -		has_scr = true;
> -		break;
> -	default:
> -		header_size = 2;
> -		break;
> -	}
> -
>  	/* Check for invalid headers. */
>  	if (len < header_size || data[0] < header_size) {
>  		stream->stats.frame.nb_invalid++;
> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
>   * to be called with a NULL buf parameter. uvc_video_decode_data and
>   * uvc_video_decode_end will never be called with a NULL buffer.
>   */
> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
> +					 bool *has_pts, bool *has_scr)

Rather than passing in pointers, could/should we add these fields to the
struct uvc_buffer? I think they are properties of the buffer object so
they could live there. Then the function prototypes would be kept cleaner.

Although, actually there might be multiple packets with timestamps per
uvc_buffer object, so maybe it needs its own struct. Or perhaps it's
more appropriate to the stream object, as they reference the 'last known
time'?



> +{
> +	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> +	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> +		*header_size = 12;
> +		*has_pts = true;
> +		*has_scr = true;
> +		break;
> +	case UVC_STREAM_PTS:
> +		*header_size = 6;
> +		*has_pts = true;
> +		break;
> +	case UVC_STREAM_SCR:
> +		*header_size = 8;
> +		*has_scr = true;
> +		break;
> +	default:
> +		*header_size = 2;
> +	}

It looks like we are also extracting the pts, and scr multiple times for
both the clock handling and the stats handling.

As we use the "get_unaligned_leXX" macros for that - I wonder if that's a
slow path that ought to be done only once where possible too?



> +}
> +
>  static int uvc_video_decode_start(struct uvc_streaming *stream,
> -		struct uvc_buffer *buf, const u8 *data, int len)
> +				  struct uvc_buffer *buf, const u8 *urb_data,
> +				  int len)
>  {
>  	u8 fid;
> +	u8 data[12];
> +	unsigned int header_size;
> +	bool has_pts = false, has_scr = false;
> +
> +	/* Cache the header since urb_data is uncached memory. The size of
> +	 * header is at most 12 bytes.
> +	 */
> +	memcpy(data, urb_data, min(len, 12));
> 
>  	/* Sanity checks:
>  	 * - packet must be at least 2 bytes long
> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
>  			uvc_video_stats_update(stream);
>  	}
> 
> -	uvc_video_clock_decode(stream, buf, data, len);
> -	uvc_video_stats_decode(stream, data, len);
> +	uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);

Perhaps just "uvc_video_decode_header()", or to match the style of the
other functions:

	"uvc_video_header_decode()"?


> +
> +	uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
> +			       has_scr);
> +	uvc_video_stats_decode(stream, data, len, header_size, has_pts,
> +			       has_scr);
> 
>  	/* Store the payload FID bit and return immediately when the buffer is
>  	 * NULL.
> --
> 2.18.0.rc2.346.g013aa6912e-goog
> 

-- 
Regards
--
Kieran

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-07-24  4:18   ` Keiichi Watanabe
@ 2018-07-24 11:07     ` Kieran Bingham
  0 siblings, 0 replies; 23+ messages in thread
From: Kieran Bingham @ 2018-07-24 11:07 UTC (permalink / raw)
  To: Keiichi Watanabe, Laurent Pinchart
  Cc: Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, Tomasz Figa, Douglas Anderson

Hi Keiichi,

On 24/07/18 05:18, Keiichi Watanabe wrote:
> Hi Kieran,
> 
> How is it going?
> I would appreciate if you could review this patch.

Thanks for the ping. ... This was on my todo list ... but just ... sooo
many Todo's :D

Ping bumped you to the top :)
And done.

Now to ping Laurent about my async patches :) <ping>

--
Regards

Kieran


> Best regards,
> Keiichi
> 
> On Thu, Jun 28, 2018 at 2:21 AM, Kieran Bingham
> <kieran.bingham@ideasonboard.com> wrote:
>> Hi Keiichi,
>>
>> I just wanted to reply here quickly to say this is a really interesting
>> development.
>>
>> I would like to review and test it properly; however I am now on holiday
>> until at least the 9th July and so I won't be able to look until after
>> that date.
>>
>> I didn't want you to be left hanging though :-)
>>
>> Ping me if you haven't heard anything by the 20th or so. Of course if
>> anyone else gets round to testing and reviewing then that's great too.
>>
>> Regards
>> --
>> Kieran
>>
>>
>>
>> On 27/06/18 11:34, Keiichi Watanabe wrote:
>>> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
>>> uncached memory allocation methods. In such situations, it sometimes
>>> takes a long time to access URB buffers.  This can be a cause of video
>>> flickering problems if a resolution is high and a USB controller has
>>> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
>>> header data from (uncached) URB buffer into (cached) local buffer.
>>>
>>> This change should make the elapsed time of the interrupt handler
>>> shorter on platforms with non-coherent DMA. We measured the elapsed
>>> time of each callback of uvc_video_complete without/with this patch
>>> while capturing Full HD video in
>>> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
>>> I tested it on the top of Kieran Bingham's Asynchronous UVC series
>>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
>>> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
>>> I collected data for 5 seconds. (There were around 480 callbacks in
>>> this case.) The following result shows that this patch makes
>>> uvc_video_complete about 2x faster.
>>>
>>>            | average | median  | min     | max     | standard deviation
>>> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
>>> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
>>>
>>> In addition, we confirmed that this patch doesn't make it worse on
>>> coherent DMA architecture by performing the same measurements on a
>>> Broadwell Chromebox with the same camera.
>>>
>>>            | average | median  | min     | max     | standard deviation
>>> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
>>> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
>>>
>>> Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
>>> ---
>>>
>>> After applying 6 patches in
>>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
>>> I measured elapsed time by adding the following code to
>>> /drivers/media/usb/uvc/uvc_video.c
>>>
>>> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>>>         struct uvc_video_queue *queue = &stream->queue;
>>>       struct uvc_buffer *buf = NULL;
>>>       int ret;
>>> +     ktime_t start, end;
>>> +     int elapsed_time;
>>> +     start = ktime_get();
>>>       switch (urb->status) {
>>>       case 0:
>>>
>>> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
>>>
>>>       INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>>>       queue_work(stream->async_wq, &uvc_urb->work);
>>> +
>>> +     end = ktime_get();
>>> +     elapsed_time = ktime_to_ns(ktime_sub(end, start));
>>> +     pr_err("elapsed time: %d ns", elapsed_time);
>>>  }
>>>
>>>  /*
>>>
>>>
>>>  drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>>>  1 file changed, 43 insertions(+), 49 deletions(-)
>>> diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
>>> index a88b2e51a666..ff2eddc55530 100644
>>> --- a/drivers/media/usb/uvc/uvc_video.c
>>> +++ b/drivers/media/usb/uvc/uvc_video.c
>>> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
>>>
>>>  static void
>>>  uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer *buf,
>>> -                    const u8 *data, int len)
>>> +                    const u8 *data, int len, unsigned int header_size,
>>> +                    bool has_pts, bool has_scr)
>>>  {
>>>       struct uvc_clock_sample *sample;
>>> -     unsigned int header_size;
>>> -     bool has_pts = false;
>>> -     bool has_scr = false;
>>>       unsigned long flags;
>>>       ktime_t time;
>>>       u16 host_sof;
>>>       u16 dev_sof;
>>>
>>> -     switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>>> -     case UVC_STREAM_PTS | UVC_STREAM_SCR:
>>> -             header_size = 12;
>>> -             has_pts = true;
>>> -             has_scr = true;
>>> -             break;
>>> -     case UVC_STREAM_PTS:
>>> -             header_size = 6;
>>> -             has_pts = true;
>>> -             break;
>>> -     case UVC_STREAM_SCR:
>>> -             header_size = 8;
>>> -             has_scr = true;
>>> -             break;
>>> -     default:
>>> -             header_size = 2;
>>> -             break;
>>> -     }
>>> -
>>>       /* Check for invalid headers. */
>>>       if (len < header_size)
>>>               return;
>>> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming *stream,
>>>   */
>>>
>>>  static void uvc_video_stats_decode(struct uvc_streaming *stream,
>>> -             const u8 *data, int len)
>>> +                                const u8 *data, int len,
>>> +                                unsigned int header_size, bool has_pts,
>>> +                                bool has_scr)
>>>  {
>>> -     unsigned int header_size;
>>> -     bool has_pts = false;
>>> -     bool has_scr = false;
>>>       u16 uninitialized_var(scr_sof);
>>>       u32 uninitialized_var(scr_stc);
>>>       u32 uninitialized_var(pts);
>>> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming *stream,
>>>           stream->stats.frame.nb_packets == 0)
>>>               stream->stats.stream.start_ts = ktime_get();
>>>
>>> -     switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>>> -     case UVC_STREAM_PTS | UVC_STREAM_SCR:
>>> -             header_size = 12;
>>> -             has_pts = true;
>>> -             has_scr = true;
>>> -             break;
>>> -     case UVC_STREAM_PTS:
>>> -             header_size = 6;
>>> -             has_pts = true;
>>> -             break;
>>> -     case UVC_STREAM_SCR:
>>> -             header_size = 8;
>>> -             has_scr = true;
>>> -             break;
>>> -     default:
>>> -             header_size = 2;
>>> -             break;
>>> -     }
>>> -
>>>       /* Check for invalid headers. */
>>>       if (len < header_size || data[0] < header_size) {
>>>               stream->stats.frame.nb_invalid++;
>>> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
>>>   * to be called with a NULL buf parameter. uvc_video_decode_data and
>>>   * uvc_video_decode_end will never be called with a NULL buffer.
>>>   */
>>> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
>>> +                                      bool *has_pts, bool *has_scr)
>>> +{
>>> +     switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>>> +     case UVC_STREAM_PTS | UVC_STREAM_SCR:
>>> +             *header_size = 12;
>>> +             *has_pts = true;
>>> +             *has_scr = true;
>>> +             break;
>>> +     case UVC_STREAM_PTS:
>>> +             *header_size = 6;
>>> +             *has_pts = true;
>>> +             break;
>>> +     case UVC_STREAM_SCR:
>>> +             *header_size = 8;
>>> +             *has_scr = true;
>>> +             break;
>>> +     default:
>>> +             *header_size = 2;
>>> +     }
>>> +}
>>> +
>>>  static int uvc_video_decode_start(struct uvc_streaming *stream,
>>> -             struct uvc_buffer *buf, const u8 *data, int len)
>>> +                               struct uvc_buffer *buf, const u8 *urb_data,
>>> +                               int len)
>>>  {
>>>       u8 fid;
>>> +     u8 data[12];
>>> +     unsigned int header_size;
>>> +     bool has_pts = false, has_scr = false;
>>> +
>>> +     /* Cache the header since urb_data is uncached memory. The size of
>>> +      * header is at most 12 bytes.
>>> +      */
>>> +     memcpy(data, urb_data, min(len, 12));
>>>
>>>       /* Sanity checks:
>>>        * - packet must be at least 2 bytes long
>>> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
>>>                       uvc_video_stats_update(stream);
>>>       }
>>>
>>> -     uvc_video_clock_decode(stream, buf, data, len);
>>> -     uvc_video_stats_decode(stream, data, len);
>>> +     uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
>>> +
>>> +     uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
>>> +                            has_scr);
>>> +     uvc_video_stats_decode(stream, data, len, header_size, has_pts,
>>> +                            has_scr);
>>>
>>>       /* Store the payload FID bit and return immediately when the buffer is
>>>        * NULL.
>>> --
>>> 2.18.0.rc2.346.g013aa6912e-goog
>>>
>>
>> --
>> Regards
>> --
>> Kieran

-- 
Regards
--
Kieran

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-06-27 10:34 [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing Keiichi Watanabe
  2018-06-27 17:21 ` Kieran Bingham
  2018-07-24 11:03 ` Kieran Bingham
@ 2018-07-30 16:00 ` Laurent Pinchart
  2018-07-30 16:03   ` Kieran Bingham
  2018-08-08  4:08   ` Tomasz Figa
  2 siblings, 2 replies; 23+ messages in thread
From: Laurent Pinchart @ 2018-07-30 16:00 UTC (permalink / raw)
  To: Keiichi Watanabe
  Cc: linux-kernel, Mauro Carvalho Chehab, linux-media, kieran.bingham,
	tfiga, dianders, Alan Stern, Ezequiel Garcia, Matwey V. Kornilov

Hi Keiichi,

(CC'ing Alan, Ezequiel and Matwey)

Thank you for the patch.

On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> uncached memory allocation methods. In such situations, it sometimes
> takes a long time to access URB buffers.  This can be a cause of video
> flickering problems if a resolution is high and a USB controller has
> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> header data from (uncached) URB buffer into (cached) local buffer.
> 
> This change should make the elapsed time of the interrupt handler
> shorter on platforms with non-coherent DMA. We measured the elapsed
> time of each callback of uvc_video_complete without/with this patch
> while capturing Full HD video in
> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> I tested it on the top of Kieran Bingham's Asynchronous UVC series
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> I collected data for 5 seconds. (There were around 480 callbacks in
> this case.) The following result shows that this patch makes
> uvc_video_complete about 2x faster.
> 
>            | average | median  | min     | max     | standard deviation
> 
> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> 
> In addition, we confirmed that this patch doesn't make it worse on
> coherent DMA architecture by performing the same measurements on a
> Broadwell Chromebox with the same camera.
> 
>            | average | median  | min     | max     | standard deviation
> 
> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns

This is very interesting, and it seems related to https://
patchwork.kernel.org/patch/10468937/. You might have seen that discussion as 
you got CC'ed at some point.

I wonder whether performances couldn't be further improved by allocating the 
URB buffers cached, as that would speed up the memcpy() as well. Have you 
tested that by any chance ?

> Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
> ---
> 
> After applying 6 patches in
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
> I measured elapsed time by adding the following code to
> /drivers/media/usb/uvc/uvc_video.c
> 
> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>         struct uvc_video_queue *queue = &stream->queue;
>  	struct uvc_buffer *buf = NULL;
>  	int ret;
> +	ktime_t start, end;
> +	int elapsed_time;
> +	start = ktime_get();
>  	switch (urb->status) {
>  	case 0:
> 
> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
> 
>  	INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>  	queue_work(stream->async_wq, &uvc_urb->work);
> +
> +	end = ktime_get();
> +	elapsed_time = ktime_to_ns(ktime_sub(end, start));
> +	pr_err("elapsed time: %d ns", elapsed_time);
>  }
> 
>  /*
> 
> 
>  drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>  1 file changed, 43 insertions(+), 49 deletions(-)
> diff --git a/drivers/media/usb/uvc/uvc_video.c
> b/drivers/media/usb/uvc/uvc_video.c index a88b2e51a666..ff2eddc55530 100644
> --- a/drivers/media/usb/uvc/uvc_video.c
> +++ b/drivers/media/usb/uvc/uvc_video.c
> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
> 
>  static void
>  uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer
> *buf, -		       const u8 *data, int len)
> +		       const u8 *data, int len, unsigned int header_size,
> +		       bool has_pts, bool has_scr)
>  {
>  	struct uvc_clock_sample *sample;
> -	unsigned int header_size;
> -	bool has_pts = false;
> -	bool has_scr = false;
>  	unsigned long flags;
>  	ktime_t time;
>  	u16 host_sof;
>  	u16 dev_sof;
> 
> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> -		header_size = 12;
> -		has_pts = true;
> -		has_scr = true;
> -		break;
> -	case UVC_STREAM_PTS:
> -		header_size = 6;
> -		has_pts = true;
> -		break;
> -	case UVC_STREAM_SCR:
> -		header_size = 8;
> -		has_scr = true;
> -		break;
> -	default:
> -		header_size = 2;
> -		break;
> -	}
> -
>  	/* Check for invalid headers. */
>  	if (len < header_size)
>  		return;
> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming
> *stream, */
> 
>  static void uvc_video_stats_decode(struct uvc_streaming *stream,
> -		const u8 *data, int len)
> +				   const u8 *data, int len,
> +				   unsigned int header_size, bool has_pts,
> +				   bool has_scr)
>  {
> -	unsigned int header_size;
> -	bool has_pts = false;
> -	bool has_scr = false;
>  	u16 uninitialized_var(scr_sof);
>  	u32 uninitialized_var(scr_stc);
>  	u32 uninitialized_var(pts);
> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming
> *stream, stream->stats.frame.nb_packets == 0)
>  		stream->stats.stream.start_ts = ktime_get();
> 
> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> -		header_size = 12;
> -		has_pts = true;
> -		has_scr = true;
> -		break;
> -	case UVC_STREAM_PTS:
> -		header_size = 6;
> -		has_pts = true;
> -		break;
> -	case UVC_STREAM_SCR:
> -		header_size = 8;
> -		has_scr = true;
> -		break;
> -	default:
> -		header_size = 2;
> -		break;
> -	}
> -
>  	/* Check for invalid headers. */
>  	if (len < header_size || data[0] < header_size) {
>  		stream->stats.frame.nb_invalid++;
> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming
> *stream) * to be called with a NULL buf parameter. uvc_video_decode_data
> and * uvc_video_decode_end will never be called with a NULL buffer.
>   */
> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
> +					 bool *has_pts, bool *has_scr)
> +{
> +	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
> +	case UVC_STREAM_PTS | UVC_STREAM_SCR:
> +		*header_size = 12;
> +		*has_pts = true;
> +		*has_scr = true;
> +		break;
> +	case UVC_STREAM_PTS:
> +		*header_size = 6;
> +		*has_pts = true;
> +		break;
> +	case UVC_STREAM_SCR:
> +		*header_size = 8;
> +		*has_scr = true;
> +		break;
> +	default:
> +		*header_size = 2;
> +	}
> +}
> +
>  static int uvc_video_decode_start(struct uvc_streaming *stream,
> -		struct uvc_buffer *buf, const u8 *data, int len)
> +				  struct uvc_buffer *buf, const u8 *urb_data,
> +				  int len)
>  {
>  	u8 fid;
> +	u8 data[12];
> +	unsigned int header_size;
> +	bool has_pts = false, has_scr = false;
> +
> +	/* Cache the header since urb_data is uncached memory. The size of
> +	 * header is at most 12 bytes.
> +	 */
> +	memcpy(data, urb_data, min(len, 12));
> 
>  	/* Sanity checks:
>  	 * - packet must be at least 2 bytes long
> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming
> *stream, uvc_video_stats_update(stream);
>  	}
> 
> -	uvc_video_clock_decode(stream, buf, data, len);
> -	uvc_video_stats_decode(stream, data, len);
> +	uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
> +
> +	uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
> +			       has_scr);
> +	uvc_video_stats_decode(stream, data, len, header_size, has_pts,
> +			       has_scr);
> 
>  	/* Store the payload FID bit and return immediately when the buffer is
>  	 * NULL.
> --
> 2.18.0.rc2.346.g013aa6912e-goog


-- 
Regards,

Laurent Pinchart




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-07-30 16:00 ` Laurent Pinchart
@ 2018-07-30 16:03   ` Kieran Bingham
  2018-08-08  4:08   ` Tomasz Figa
  1 sibling, 0 replies; 23+ messages in thread
From: Kieran Bingham @ 2018-07-30 16:03 UTC (permalink / raw)
  To: Laurent Pinchart, Keiichi Watanabe
  Cc: linux-kernel, Mauro Carvalho Chehab, linux-media, tfiga,
	dianders, Alan Stern, Ezequiel Garcia, Matwey V. Kornilov

[-- Attachment #1: Type: text/plain, Size: 8077 bytes --]

I'm attaching my debug patch in case it's useful to anyone measuring
performances.

(Recently with an extra metric added thanks to Keiichi)

Also available at:
git://git.kernel.org/pub/scm/linux/kernel/git/kbingham/rcar.git uvc/async-ml

Regards
--
Kieran

On 30/07/18 17:00, Laurent Pinchart wrote:
> Hi Keiichi,
> 
> (CC'ing Alan, Ezequiel and Matwey)
> 
> Thank you for the patch.
> 
> On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
>> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
>> uncached memory allocation methods. In such situations, it sometimes
>> takes a long time to access URB buffers.  This can be a cause of video
>> flickering problems if a resolution is high and a USB controller has
>> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
>> header data from (uncached) URB buffer into (cached) local buffer.
>>
>> This change should make the elapsed time of the interrupt handler
>> shorter on platforms with non-coherent DMA. We measured the elapsed
>> time of each callback of uvc_video_complete without/with this patch
>> while capturing Full HD video in
>> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
>> I tested it on the top of Kieran Bingham's Asynchronous UVC series
>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
>> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
>> I collected data for 5 seconds. (There were around 480 callbacks in
>> this case.) The following result shows that this patch makes
>> uvc_video_complete about 2x faster.
>>
>>            | average | median  | min     | max     | standard deviation
>>
>> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
>> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
>>
>> In addition, we confirmed that this patch doesn't make it worse on
>> coherent DMA architecture by performing the same measurements on a
>> Broadwell Chromebox with the same camera.
>>
>>            | average | median  | min     | max     | standard deviation
>>
>> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
>> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
> 
> This is very interesting, and it seems related to https://
> patchwork.kernel.org/patch/10468937/. You might have seen that discussion as 
> you got CC'ed at some point.
> 
> I wonder whether performances couldn't be further improved by allocating the 
> URB buffers cached, as that would speed up the memcpy() as well. Have you 
> tested that by any chance ?
> 
>> Signed-off-by: Keiichi Watanabe <keiichiw@chromium.org>
>> ---
>>
>> After applying 6 patches in
>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html,
>> I measured elapsed time by adding the following code to
>> /drivers/media/usb/uvc/uvc_video.c
>>
>> @@ -XXXX,6 +XXXX,9 @@ static void uvc_video_complete(struct urb *urb)
>>         struct uvc_video_queue *queue = &stream->queue;
>>  	struct uvc_buffer *buf = NULL;
>>  	int ret;
>> +	ktime_t start, end;
>> +	int elapsed_time;
>> +	start = ktime_get();
>>  	switch (urb->status) {
>>  	case 0:
>>
>> @@ -XXXX,6 +XXXX,10 @@ static void uvc_video_complete(struct urb *urb)
>>
>>  	INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
>>  	queue_work(stream->async_wq, &uvc_urb->work);
>> +
>> +	end = ktime_get();
>> +	elapsed_time = ktime_to_ns(ktime_sub(end, start));
>> +	pr_err("elapsed time: %d ns", elapsed_time);
>>  }
>>
>>  /*
>>
>>
>>  drivers/media/usb/uvc/uvc_video.c | 92 +++++++++++++++----------------
>>  1 file changed, 43 insertions(+), 49 deletions(-)
>> diff --git a/drivers/media/usb/uvc/uvc_video.c
>> b/drivers/media/usb/uvc/uvc_video.c index a88b2e51a666..ff2eddc55530 100644
>> --- a/drivers/media/usb/uvc/uvc_video.c
>> +++ b/drivers/media/usb/uvc/uvc_video.c
>> @@ -391,36 +391,15 @@ static inline ktime_t uvc_video_get_time(void)
>>
>>  static void
>>  uvc_video_clock_decode(struct uvc_streaming *stream, struct uvc_buffer
>> *buf, -		       const u8 *data, int len)
>> +		       const u8 *data, int len, unsigned int header_size,
>> +		       bool has_pts, bool has_scr)
>>  {
>>  	struct uvc_clock_sample *sample;
>> -	unsigned int header_size;
>> -	bool has_pts = false;
>> -	bool has_scr = false;
>>  	unsigned long flags;
>>  	ktime_t time;
>>  	u16 host_sof;
>>  	u16 dev_sof;
>>
>> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> -		header_size = 12;
>> -		has_pts = true;
>> -		has_scr = true;
>> -		break;
>> -	case UVC_STREAM_PTS:
>> -		header_size = 6;
>> -		has_pts = true;
>> -		break;
>> -	case UVC_STREAM_SCR:
>> -		header_size = 8;
>> -		has_scr = true;
>> -		break;
>> -	default:
>> -		header_size = 2;
>> -		break;
>> -	}
>> -
>>  	/* Check for invalid headers. */
>>  	if (len < header_size)
>>  		return;
>> @@ -717,11 +696,10 @@ void uvc_video_clock_update(struct uvc_streaming
>> *stream, */
>>
>>  static void uvc_video_stats_decode(struct uvc_streaming *stream,
>> -		const u8 *data, int len)
>> +				   const u8 *data, int len,
>> +				   unsigned int header_size, bool has_pts,
>> +				   bool has_scr)
>>  {
>> -	unsigned int header_size;
>> -	bool has_pts = false;
>> -	bool has_scr = false;
>>  	u16 uninitialized_var(scr_sof);
>>  	u32 uninitialized_var(scr_stc);
>>  	u32 uninitialized_var(pts);
>> @@ -730,25 +708,6 @@ static void uvc_video_stats_decode(struct uvc_streaming
>> *stream, stream->stats.frame.nb_packets == 0)
>>  		stream->stats.stream.start_ts = ktime_get();
>>
>> -	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> -	case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> -		header_size = 12;
>> -		has_pts = true;
>> -		has_scr = true;
>> -		break;
>> -	case UVC_STREAM_PTS:
>> -		header_size = 6;
>> -		has_pts = true;
>> -		break;
>> -	case UVC_STREAM_SCR:
>> -		header_size = 8;
>> -		has_scr = true;
>> -		break;
>> -	default:
>> -		header_size = 2;
>> -		break;
>> -	}
>> -
>>  	/* Check for invalid headers. */
>>  	if (len < header_size || data[0] < header_size) {
>>  		stream->stats.frame.nb_invalid++;
>> @@ -957,10 +916,41 @@ static void uvc_video_stats_stop(struct uvc_streaming
>> *stream) * to be called with a NULL buf parameter. uvc_video_decode_data
>> and * uvc_video_decode_end will never be called with a NULL buffer.
>>   */
>> +static void uvc_video_decode_header_size(const u8 *data, int *header_size,
>> +					 bool *has_pts, bool *has_scr)
>> +{
>> +	switch (data[1] & (UVC_STREAM_PTS | UVC_STREAM_SCR)) {
>> +	case UVC_STREAM_PTS | UVC_STREAM_SCR:
>> +		*header_size = 12;
>> +		*has_pts = true;
>> +		*has_scr = true;
>> +		break;
>> +	case UVC_STREAM_PTS:
>> +		*header_size = 6;
>> +		*has_pts = true;
>> +		break;
>> +	case UVC_STREAM_SCR:
>> +		*header_size = 8;
>> +		*has_scr = true;
>> +		break;
>> +	default:
>> +		*header_size = 2;
>> +	}
>> +}
>> +
>>  static int uvc_video_decode_start(struct uvc_streaming *stream,
>> -		struct uvc_buffer *buf, const u8 *data, int len)
>> +				  struct uvc_buffer *buf, const u8 *urb_data,
>> +				  int len)
>>  {
>>  	u8 fid;
>> +	u8 data[12];
>> +	unsigned int header_size;
>> +	bool has_pts = false, has_scr = false;
>> +
>> +	/* Cache the header since urb_data is uncached memory. The size of
>> +	 * header is at most 12 bytes.
>> +	 */
>> +	memcpy(data, urb_data, min(len, 12));
>>
>>  	/* Sanity checks:
>>  	 * - packet must be at least 2 bytes long
>> @@ -983,8 +973,12 @@ static int uvc_video_decode_start(struct uvc_streaming
>> *stream, uvc_video_stats_update(stream);
>>  	}
>>
>> -	uvc_video_clock_decode(stream, buf, data, len);
>> -	uvc_video_stats_decode(stream, data, len);
>> +	uvc_video_decode_header_size(data, &header_size, &has_pts, &has_scr);
>> +
>> +	uvc_video_clock_decode(stream, buf, data, len, header_size, has_pts,
>> +			       has_scr);
>> +	uvc_video_stats_decode(stream, data, len, header_size, has_pts,
>> +			       has_scr);
>>
>>  	/* Store the payload FID bit and return immediately when the buffer is
>>  	 * NULL.
>> --
>> 2.18.0.rc2.346.g013aa6912e-goog
> 
> 

-- 
Regards
--
Kieran

[-- Attachment #2: 0001-uvcvideo-debug-Add-statistics-for-measuring-performa.patch --]
[-- Type: text/x-patch, Size: 10707 bytes --]

From cebbd1b629bbe5f856ec5dc7591478c003f5a944 Mon Sep 17 00:00:00 2001
From: Kieran Bingham <kieran.bingham@ideasonboard.com>
Date: Wed, 22 Mar 2017 09:54:11 +0000
Subject: [PATCH] uvcvideo: debug: Add statistics for measuring performance

Measuring the performance of the uvcvideo driver requires introspection
into the internal state.

Extend the information provided at:
 /sys/kernel/debug/usb/uvcvideo/${bus}-${dev}/stats

to track the overall data throughput as well as URB usage and processing
speeds.

This patch is not intended for integration into any release tree, but is
provided as-is to assist in measuring the performance while fine tuning
specific use-cases of the driver.

Signed-off-by: Kieran Bingham <kieran.bingham@ideasonboard.com>
---

 v2:
   - uvcvideo: debug: Convert to using ktime_t
   - uvcvideo: debug: Track header processing time
     (H/T Keiichi Watanabe <keiichiw@chromium.org>

Signed-off-by: Kieran Bingham <kieran@bingham.xyz>
---
 drivers/media/usb/uvc/uvc_video.c | 153 +++++++++++++++++++++++++++++-
 drivers/media/usb/uvc/uvcvideo.h  |  20 ++++
 2 files changed, 171 insertions(+), 2 deletions(-)

diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index a4222d8d09a5..40943890668b 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -863,12 +863,56 @@ static void uvc_video_stats_update(struct uvc_streaming *stream)
 	memset(&stream->stats.frame, 0, sizeof(stream->stats.frame));
 }
 
+size_t uvc_video_dump_time_stats(char *buf, size_t size,
+				 struct uvc_stats_time *stat, const char *pfx)
+{
+	unsigned int avg = 0;
+
+	if (stat->qty)
+		avg = stat->duration / stat->qty;
+
+	/* Stat durations are in nanoseconds, we present in micro-seconds */
+	return scnprintf(buf, size,
+			 "%s: %llu/%u uS/qty: %u.%03u avg %u.%03u min %u.%03u max (uS)\n",
+			 pfx, stat->duration / 1000, stat->qty,
+			  avg / 1000, avg % 1000,
+			  stat->min / 1000, stat->min % 1000,
+			  stat->max / 1000, stat->max % 1000);
+}
+
+size_t uvc_video_dump_speed(char *buf, size_t size, const char *pfx,
+			    u64 bytes, u64 milliseconds)
+{
+	unsigned int rate = 0;
+	bool gbit = false;
+
+	if (milliseconds)
+		rate = bytes * 8 / milliseconds;
+
+	if (rate >= 1000000) {
+		gbit = true;
+		rate /= 1000;
+	}
+
+	/*
+	* bits/milliseconds == kilobits/seconds,
+	* presented here as Mbits/s (or Gbit/s) with 3 decimal places
+	*/
+	return scnprintf(buf, size, "%s: %d.%03d %sbits/s\n",
+			 pfx, rate / 1000, rate % 1000,
+			 gbit ? "G" : "M");
+}
+
 size_t uvc_video_stats_dump(struct uvc_streaming *stream, char *buf,
 			    size_t size)
 {
+	u64 bytes = stream->stats.stream.bytes; /* Single sample */
+	unsigned int empty_ratio = 0;
 	unsigned int scr_sof_freq;
 	unsigned int duration;
+	unsigned int fps = 0;
 	size_t count = 0;
+	u64 cpu = 0;
 
 	/* Compute the SCR.SOF frequency estimate. At the nominal 1kHz SOF
 	 * frequency this will not overflow before more than 1h.
@@ -881,12 +925,20 @@ size_t uvc_video_stats_dump(struct uvc_streaming *stream, char *buf,
 	else
 		scr_sof_freq = 0;
 
+	if (stream->stats.stream.nb_packets)
+		empty_ratio = stream->stats.stream.nb_empty * 100 /
+			stream->stats.stream.nb_packets;
+
 	count += scnprintf(buf + count, size - count,
-			   "frames:  %u\npackets: %u\nempty:   %u\n"
-			   "errors:  %u\ninvalid: %u\n",
+			   "frames:  %u\n"
+			   "packets: %u\n"
+			   "empty:   %u (%u %%)\n"
+			   "errors:  %u\n"
+			   "invalid: %u\n",
 			   stream->stats.stream.nb_frames,
 			   stream->stats.stream.nb_packets,
 			   stream->stats.stream.nb_empty,
+			   empty_ratio,
 			   stream->stats.stream.nb_errors,
 			   stream->stats.stream.nb_invalid);
 	count += scnprintf(buf + count, size - count,
@@ -903,6 +955,53 @@ size_t uvc_video_stats_dump(struct uvc_streaming *stream, char *buf,
 			   stream->stats.stream.min_sof,
 			   stream->stats.stream.max_sof,
 			   scr_sof_freq / 1000, scr_sof_freq % 1000);
+	count += scnprintf(buf + count, size - count,
+			   "bytes %lld : duration %d\n",
+			   bytes, duration);
+
+	if (duration != 0) {
+		/* Duration is in milliseconds, * 100 to gain 2 dp precision */
+		fps = stream->stats.stream.nb_frames * 1000 * 100 / duration;
+		/* CPU usage as a % with 6 decimal places */
+		cpu = stream->stats.urbstat.decode.duration / duration * 100;
+	}
+
+	count += scnprintf(buf + count, size - count,
+			   "FPS: %u.%02u\n", fps / 100, fps % 100);
+
+	/* Processing Times */
+
+	count += uvc_video_dump_time_stats(buf + count, size - count,
+			   &stream->stats.urbstat.urb, "URB");
+	count += uvc_video_dump_time_stats(buf + count, size - count,
+			   &stream->stats.urbstat.header, "header");
+	count += uvc_video_dump_time_stats(buf + count, size - count,
+			   &stream->stats.urbstat.latency, "latency");
+	count += uvc_video_dump_time_stats(buf + count, size - count,
+			   &stream->stats.urbstat.decode, "decode");
+
+	/* Processing Speeds */
+
+	/* This should be representative of the memory bus / cpu speed */
+	count += uvc_video_dump_speed(buf + count, size - count,
+			   "raw decode speed",
+			   bytes,
+			   stream->stats.urbstat.decode.duration / 1000000);
+
+	/* Raw bus speed - scheduling latencies */
+	count += uvc_video_dump_speed(buf + count, size - count,
+			   "raw URB handling speed",
+			   bytes,
+			   stream->stats.urbstat.urb.duration / 1000000);
+
+	/* Throughput against wall clock time; stream duration is in milliseconds */
+	count += uvc_video_dump_speed(buf + count, size - count,
+			   "throughput", bytes, duration);
+
+	/* Determine the 'CPU Usage' of our URB processing */
+	count += scnprintf(buf + count, size - count,
+			   "URB decode CPU usage %llu.%06llu %%\n",
+			   cpu / 1000000, cpu % 1000000);
 
 	return count;
 }
@@ -911,6 +1010,11 @@ static void uvc_video_stats_start(struct uvc_streaming *stream)
 {
 	memset(&stream->stats, 0, sizeof(stream->stats));
 	stream->stats.stream.min_sof = 2048;
+
+	/*
+	 * Seed every urbstat minimum with UINT_MAX (-1 wraps) so the first
+	 * sample always updates it; the header stat must be included too,
+	 * as uvc_stats_add() is called for it from uvc_video_complete().
+	 */
+	stream->stats.urbstat.urb.min = -1;
+	stream->stats.urbstat.header.min = -1;
+	stream->stats.urbstat.latency.min = -1;
+	stream->stats.urbstat.decode.min = -1;
 }
 
static void uvc_video_stats_stop(struct uvc_streaming *stream)
@@ -918,6 +1021,27 @@ static void uvc_video_stats_stop(struct uvc_streaming *stream)
 	stream->stats.stream.stop_ts = ktime_get();
 }
 
+static s64 uvc_stats_add(struct uvc_stats_time *s,
+			 const ktime_t a, const ktime_t b)
+{
+	ktime_t delta;
+	u64 duration;
+
+	delta = ktime_sub(b, a);
+	duration = ktime_to_ns(delta);
+
+	s->qty++;
+	s->duration += duration;
+
+	if (duration < s->min)
+		s->min = duration;
+
+	if (duration > s->max)
+		s->max = duration;
+
+	return duration;
+}
+
 /* ------------------------------------------------------------------------
  * Video codecs
  */
@@ -981,6 +1105,9 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
 		stream->sequence++;
 		if (stream->sequence)
 			uvc_video_stats_update(stream);
+
+		/* Update the stream timer each frame */
+		stream->stats.stream.stop_ts = ktime_get();
 	}
 
 	uvc_video_clock_decode(stream, buf, data, len);
@@ -1063,18 +1190,34 @@ static int uvc_video_decode_start(struct uvc_streaming *stream,
 static void uvc_video_copy_data_work(struct work_struct *work)
 {
 	struct uvc_urb *uvc_urb = container_of(work, struct uvc_urb, work);
+	ktime_t now;
 	unsigned int i;
 	int ret;
 
+	/* Measure decode performance */
+	uvc_urb->decode_start = ktime_get();
+	/* Measure scheduling latency */
+	uvc_stats_add(&uvc_urb->stream->stats.urbstat.latency,
+		      uvc_urb->received, uvc_urb->decode_start);
+
 	for (i = 0; i < uvc_urb->async_operations; i++) {
 		struct uvc_copy_op *op = &uvc_urb->copy_operations[i];
 
 		memcpy(op->dst, op->src, op->len);
+		uvc_urb->stream->stats.stream.bytes += op->len;
 
 		/* Release reference taken on this buffer */
 		uvc_queue_buffer_release(op->buf);
 	}
 
+	now = ktime_get();
+	/* measure 'memcpy time' */
+	uvc_stats_add(&uvc_urb->stream->stats.urbstat.decode,
+		      uvc_urb->decode_start, now);
+	/* measure 'full urb processing time' */
+	uvc_stats_add(&uvc_urb->stream->stats.urbstat.urb,
+		      uvc_urb->received, now);
+
 	ret = usb_submit_urb(uvc_urb->urb, GFP_ATOMIC);
 	if (ret < 0)
 		uvc_printk(KERN_ERR, "Failed to resubmit video URB (%d).\n",
@@ -1461,6 +1604,9 @@ static void uvc_video_complete(struct urb *urb)
 	unsigned long flags;
 	int ret;
 
+	/* Track URB processing performance */
+	uvc_urb->received = ktime_get();
+
 	switch (urb->status) {
 	case 0:
 		break;
@@ -1512,6 +1658,9 @@ static void uvc_video_complete(struct urb *urb)
 
 	INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
 	queue_work(stream->async_wq, &uvc_urb->work);
+
+	uvc_stats_add(&uvc_urb->stream->stats.urbstat.header,
+		      uvc_urb->received, ktime_get());
 }
 
 /*
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index 7ebeb680710b..efaa55a24a68 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -462,6 +462,13 @@ struct uvc_stats_frame {
 	u32 scr_stc;			/* SCR.STC of the last packet */
 };
 
+struct uvc_stats_time {
+	u64 duration;			/* Cumulative total duration between two events */
+	unsigned int qty;		/* Number of events represented in the total */
+	unsigned int min;		/* Shortest duration */
+	unsigned int max;		/* Longest duration */
+};
+
 struct uvc_stats_stream {
 	ktime_t start_ts;		/* Stream start timestamp */
 	ktime_t stop_ts;		/* Stream stop timestamp */
@@ -483,6 +490,8 @@ struct uvc_stats_stream {
 	unsigned int scr_sof;		/* STC.SOF of the last packet */
 	unsigned int min_sof;		/* Minimum STC.SOF value */
 	unsigned int max_sof;		/* Maximum STC.SOF value */
+
+	unsigned long bytes;		/* Successfully transferred bytes */
 };
 
 #define UVC_METATADA_BUF_SIZE 1024
@@ -512,6 +521,8 @@ struct uvc_copy_op {
  * @async_operations: counter to indicate the number of copy operations
  * @copy_operations: work descriptors for asynchronous copy operations
  * @work: work queue entry for asynchronous decode
+ * @received: URB interrupt time stamp
+ * @decode_start: URB processing start time stamp
  */
 struct uvc_urb {
 	struct urb *urb;
@@ -523,6 +534,9 @@ struct uvc_urb {
 	unsigned int async_operations;
 	struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];
 	struct work_struct work;
+
+	ktime_t received;
+	ktime_t decode_start;
 };
 
 struct uvc_streaming {
@@ -585,6 +599,12 @@ struct uvc_streaming {
 	struct {
 		struct uvc_stats_frame frame;
 		struct uvc_stats_stream stream;
+		struct uvc_stats_urb {
+			struct uvc_stats_time header;
+			struct uvc_stats_time latency;
+			struct uvc_stats_time decode;
+			struct uvc_stats_time urb;
+		} urbstat;
 	} stats;
 
 	/* Timestamps support. */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-07-30 16:00 ` Laurent Pinchart
  2018-07-30 16:03   ` Kieran Bingham
@ 2018-08-08  4:08   ` Tomasz Figa
  2018-08-08  8:42     ` Laurent Pinchart
  1 sibling, 1 reply; 23+ messages in thread
From: Tomasz Figa @ 2018-08-08  4:08 UTC (permalink / raw)
  To: Laurent Pinchart
  Cc: keiichiw, Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, Kieran Bingham, Douglas Anderson,
	Alan Stern, Ezequiel Garcia, Matwey V. Kornilov

On Tue, Jul 31, 2018 at 1:00 AM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> Hi Keiichi,
>
> (CC'ing Alan, Ezequiel and Matwey)
>
> Thank you for the patch.
>
> On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
> > On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> > uncached memory allocation methods. In such situations, it sometimes
> > takes a long time to access URB buffers.  This can be a cause of video
> > flickering problems if a resolution is high and a USB controller has
> > a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> > header data from (uncached) URB buffer into (cached) local buffer.
> >
> > This change should make the elapsed time of the interrupt handler
> > shorter on platforms with non-coherent DMA. We measured the elapsed
> > time of each callback of uvc_video_complete without/with this patch
> > while capturing Full HD video in
> > https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> > I tested it on the top of Kieran Bingham's Asynchronous UVC series
> > https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
> > The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> > I collected data for 5 seconds. (There were around 480 callbacks in
> > this case.) The following result shows that this patch makes
> > uvc_video_complete about 2x faster.
> >
> >            | average | median  | min     | max     | standard deviation
> >
> > w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> > w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> >
> > In addition, we confirmed that this patch doesn't make it worse on
> > coherent DMA architecture by performing the same measurements on a
> > Broadwell Chromebox with the same camera.
> >
> >            | average | median  | min     | max     | standard deviation
> >
> > w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> > w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
>
> This is very interesting, and it seems related to https://
> patchwork.kernel.org/patch/10468937/. You might have seen that discussion as
> you got CC'ed at some point.
>
> I wonder whether performances couldn't be further improved by allocating the
> URB buffers cached, as that would speed up the memcpy() as well. Have you
> tested that by any chance ?

We haven't measured it, but the issue being solved here was indeed
significantly reduced by using cached URB buffers, even without
Kieran's async series. After we discovered the latter, we just
backported it and decided to further tweak the last remaining bit, to
avoid playing too much with the DMA API in code used in production on
several different platforms (including both ARM and x86).

If you think we could change the driver to use cached buffers instead
(as the pwc driver mentioned in another thread), I wouldn't have
anything against it obviously.

Best regards,
Tomasz

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08  4:08   ` Tomasz Figa
@ 2018-08-08  8:42     ` Laurent Pinchart
  2018-08-08 12:45       ` Keiichi Watanabe
  0 siblings, 1 reply; 23+ messages in thread
From: Laurent Pinchart @ 2018-08-08  8:42 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: keiichiw, Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, Kieran Bingham, Douglas Anderson,
	Alan Stern, Ezequiel Garcia, Matwey V. Kornilov

Hi Tomasz,

On Wednesday, 8 August 2018 07:08:59 EEST Tomasz Figa wrote:
> On Tue, Jul 31, 2018 at 1:00 AM Laurent Pinchart wrote:
> > On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
> >> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> >> uncached memory allocation methods. In such situations, it sometimes
> >> takes a long time to access URB buffers.  This can be a cause of video
> >> flickering problems if a resolution is high and a USB controller has
> >> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> >> header data from (uncached) URB buffer into (cached) local buffer.
> >> 
> >> This change should make the elapsed time of the interrupt handler
> >> shorter on platforms with non-coherent DMA. We measured the elapsed
> >> time of each callback of uvc_video_complete without/with this patch
> >> while capturing Full HD video in
> >> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> >> I tested it on the top of Kieran Bingham's Asynchronous UVC series
> >> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
> >> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> >> I collected data for 5 seconds. (There were around 480 callbacks in
> >> this case.) The following result shows that this patch makes
> >> uvc_video_complete about 2x faster.
> >> 
> >>            | average | median  | min     | max     | standard deviation
> >> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> >> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> >> 
> >> In addition, we confirmed that this patch doesn't make it worse on
> >> coherent DMA architecture by performing the same measurements on a
> >> Broadwell Chromebox with the same camera.
> >> 
> >>            | average | median  | min     | max     | standard deviation
> >> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> >> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
> > 
> > This is very interesting, and it seems related to https://
> > patchwork.kernel.org/patch/10468937/. You might have seen that discussion
> > as you got CC'ed at some point.
> > 
> > I wonder whether performances couldn't be further improved by allocating
> > the URB buffers cached, as that would speed up the memcpy() as well. Have
> > you tested that by any chance ?
> 
> We haven't measure it, but the issue being solved here was indeed
> significantly reduced by using cached URB buffers, even without
> Kieran's async series. After we discovered the latter, we just
> backported it and decided to further tweak the last remaining bit, to
> avoid playing too much with the DMA API in code used in production on
> several different platforms (including both ARM and x86).
> 
> If you think we could change the driver to use cached buffers instead
> (as the pwc driver mentioned in another thread), I wouldn't have
> anything against it obviously.

I think there's a chance that performances could be further improved. 
Furthermore, it would lead to simpler code as we wouldn't need to deal with 
caching headers manually. I would however like to see numbers before making a 
decision.

-- 
Regards,

Laurent Pinchart




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08  8:42     ` Laurent Pinchart
@ 2018-08-08 12:45       ` Keiichi Watanabe
  2018-08-08 13:07         ` Kieran Bingham
  2018-08-08 14:20         ` Alan Stern
  0 siblings, 2 replies; 23+ messages in thread
From: Keiichi Watanabe @ 2018-08-08 12:45 UTC (permalink / raw)
  To: Laurent Pinchart
  Cc: Tomasz Figa, Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, kieran.bingham, Douglas Anderson,
	stern, ezequiel, matwey

Hi Laurent, Kieran, Tomasz,

Thank you for reviews and suggestions.
I want to do additional measurements for improving the performance.

Let me clarify my understanding:
Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
urb_buffer is allocated by usb_alloc_coherent with
URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
This is because we want to avoid frequent DMA mappings, which are
generally expensive.
However, memories allocated in this way are not cached.

So, we wonder if using usb_alloc_coherent is really fast.
In other words, we want to know which is better:
"No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached memory".

For this purpose, I'm planning to measure performance on ARM
Chromebooks in the following conditions:
1. Current implementation with Kieran's patches
2. 1. + my patch
3. Use kmalloc instead

1 and 2 are the same conditions I reported in the first mail on this thread.
For condition 3, I only have to add "#define CONFIG_DMA_NONCOHERENT"
at the beginning of uvc_video.c.

Does this plan sound reasonable?

Best regards,
Keiichi
On Wed, Aug 8, 2018 at 5:42 PM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> Hi Tomasz,
>
> On Wednesday, 8 August 2018 07:08:59 EEST Tomasz Figa wrote:
> > On Tue, Jul 31, 2018 at 1:00 AM Laurent Pinchart wrote:
> > > On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
> > >> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> > >> uncached memory allocation methods. In such situations, it sometimes
> > >> takes a long time to access URB buffers.  This can be a cause of video
> > >> flickering problems if a resolution is high and a USB controller has
> > >> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> > >> header data from (uncached) URB buffer into (cached) local buffer.
> > >>
> > >> This change should make the elapsed time of the interrupt handler
> > >> shorter on platforms with non-coherent DMA. We measured the elapsed
> > >> time of each callback of uvc_video_complete without/with this patch
> > >> while capturing Full HD video in
> > >> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> > >> I tested it on the top of Kieran Bingham's Asynchronous UVC series
> > >> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
> > >> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> > >> I collected data for 5 seconds. (There were around 480 callbacks in
> > >> this case.) The following result shows that this patch makes
> > >> uvc_video_complete about 2x faster.
> > >>
> > >>            | average | median  | min     | max     | standard deviation
> > >> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> > >> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> > >>
> > >> In addition, we confirmed that this patch doesn't make it worse on
> > >> coherent DMA architecture by performing the same measurements on a
> > >> Broadwell Chromebox with the same camera.
> > >>
> > >>            | average | median  | min     | max     | standard deviation
> > >> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> > >> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
> > >
> > > This is very interesting, and it seems related to https://
> > > patchwork.kernel.org/patch/10468937/. You might have seen that discussion
> > > as you got CC'ed at some point.
> > >
> > > I wonder whether performances couldn't be further improved by allocating
> > > the URB buffers cached, as that would speed up the memcpy() as well. Have
> > > you tested that by any chance ?
> >
> > We haven't measure it, but the issue being solved here was indeed
> > significantly reduced by using cached URB buffers, even without
> > Kieran's async series. After we discovered the latter, we just
> > backported it and decided to further tweak the last remaining bit, to
> > avoid playing too much with the DMA API in code used in production on
> > several different platforms (including both ARM and x86).
> >
> > If you think we could change the driver to use cached buffers instead
> > (as the pwc driver mentioned in another thread), I wouldn't have
> > anything against it obviously.
>
> I think there's a chance that performances could be further improved.
> Furthermore, it would lean to simpler code as we wouldn't need to deal with
> caching headers manually. I would however like to see numbers before making a
> decision.
>
> --
> Regards,
>
> Laurent Pinchart
>
>
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 12:45       ` Keiichi Watanabe
@ 2018-08-08 13:07         ` Kieran Bingham
  2018-08-08 14:02           ` Keiichi Watanabe
  2018-08-08 14:20         ` Alan Stern
  1 sibling, 1 reply; 23+ messages in thread
From: Kieran Bingham @ 2018-08-08 13:07 UTC (permalink / raw)
  To: Keiichi Watanabe, Laurent Pinchart
  Cc: Tomasz Figa, Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, Douglas Anderson, stern, ezequiel,
	matwey

Hi All,

On 08/08/18 13:45, Keiichi Watanabe wrote:
> Hi Laurent, Kieran, Tomasz,
> 
> Thank you for reviews and suggestions.
> I want to do additional measurements for improving the performance.
> 
> Let me clarify my understanding:
> Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> urb_buffer is allocated by usb_alloc_coherent with
> URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> This is because we want to avoid frequent DMA mappings, which are
> generally expensive.
> However, memories allocated in this way are not cached.
> 
> So, we wonder if using usb_alloc_coherent is really fast.
> In other words, we want to know which is better:
> "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached memory".
> 
> For this purpose, I'm planning to measure performance on ARM
> Chromebooks in the following conditions:
> 1. Current implementation with Kieran's patches
> 2. 1. + my patch
> 3. Use kmalloc instead
> 
> 1 and 2 are the same conditions I reported in the first mail on this thread.
> For condition 3, I only have to add "#define CONFIG_DMA_NONCOHERENT"
> at the beginning of uvc_video.c.

I'd be interested in numbers/performances both with and without my async
if possible too.

The async path can be easily disabled temporarily with the following
change: (perhaps this should be a module option?)

diff --git a/drivers/media/usb/uvc/uvc_video.c
b/drivers/media/usb/uvc/uvc_video.c
index 8bb6e90f3483..f9fbdc9bfa4b 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -1505,7 +1505,8 @@ static void uvc_video_complete(struct urb *urb)
        }

        INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
-       queue_work(stream->async_wq, &uvc_urb->work);
+//     queue_work(stream->async_wq, &uvc_urb->work);
+       uvc_video_copy_data_work(&uvc_urb->work);
 }

 /*


I do suspect that even with cached buffers, it's probably likely we
should still consider the async patches to move the memcopy out of
interrupt context.

--
Regards

Kieran




> 
> Does this plan sound reasonable?
> 
> Best regards,
> Keiichi
> On Wed, Aug 8, 2018 at 5:42 PM Laurent Pinchart
> <laurent.pinchart@ideasonboard.com> wrote:
>>
>> Hi Tomasz,
>>
>> On Wednesday, 8 August 2018 07:08:59 EEST Tomasz Figa wrote:
>>> On Tue, Jul 31, 2018 at 1:00 AM Laurent Pinchart wrote:
>>>> On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
>>>>> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
>>>>> uncached memory allocation methods. In such situations, it sometimes
>>>>> takes a long time to access URB buffers.  This can be a cause of video
>>>>> flickering problems if a resolution is high and a USB controller has
>>>>> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
>>>>> header data from (uncached) URB buffer into (cached) local buffer.
>>>>>
>>>>> This change should make the elapsed time of the interrupt handler
>>>>> shorter on platforms with non-coherent DMA. We measured the elapsed
>>>>> time of each callback of uvc_video_complete without/with this patch
>>>>> while capturing Full HD video in
>>>>> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
>>>>> I tested it on the top of Kieran Bingham's Asynchronous UVC series
>>>>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
>>>>> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
>>>>> I collected data for 5 seconds. (There were around 480 callbacks in
>>>>> this case.) The following result shows that this patch makes
>>>>> uvc_video_complete about 2x faster.
>>>>>
>>>>>            | average | median  | min     | max     | standard deviation
>>>>> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
>>>>> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
>>>>>
>>>>> In addition, we confirmed that this patch doesn't make it worse on
>>>>> coherent DMA architecture by performing the same measurements on a
>>>>> Broadwell Chromebox with the same camera.
>>>>>
>>>>>            | average | median  | min     | max     | standard deviation
>>>>> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
>>>>> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
>>>>
>>>> This is very interesting, and it seems related to https://
>>>> patchwork.kernel.org/patch/10468937/. You might have seen that discussion
>>>> as you got CC'ed at some point.
>>>>
>>>> I wonder whether performances couldn't be further improved by allocating
>>>> the URB buffers cached, as that would speed up the memcpy() as well. Have
>>>> you tested that by any chance ?
>>>
>>> We haven't measure it, but the issue being solved here was indeed
>>> significantly reduced by using cached URB buffers, even without
>>> Kieran's async series. After we discovered the latter, we just
>>> backported it and decided to further tweak the last remaining bit, to
>>> avoid playing too much with the DMA API in code used in production on
>>> several different platforms (including both ARM and x86).
>>>
>>> If you think we could change the driver to use cached buffers instead
>>> (as the pwc driver mentioned in another thread), I wouldn't have
>>> anything against it obviously.
>>
>> I think there's a chance that performances could be further improved.
>> Furthermore, it would lean to simpler code as we wouldn't need to deal with
>> caching headers manually. I would however like to see numbers before making a
>> decision.
>>
>> --
>> Regards,
>>
>> Laurent Pinchart
>>
>>
>>

-- 
Regards
--
Kieran

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 13:07         ` Kieran Bingham
@ 2018-08-08 14:02           ` Keiichi Watanabe
  0 siblings, 0 replies; 23+ messages in thread
From: Keiichi Watanabe @ 2018-08-08 14:02 UTC (permalink / raw)
  To: kieran.bingham
  Cc: Laurent Pinchart, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List,
	Douglas Anderson, Alan Stern, ezequiel, matwey

Hi Kieran,
On Wed, Aug 8, 2018 at 10:07 PM Kieran Bingham
<kieran.bingham@ideasonboard.com> wrote:
>
> Hi All,
>
> On 08/08/18 13:45, Keiichi Watanabe wrote:
> > Hi Laurent, Kieran, Tomasz,
> >
> > Thank you for reviews and suggestions.
> > I want to do additional measurements for improving the performance.
> >
> > Let me clarify my understanding:
> > Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> > urb_buffer is allocated by usb_alloc_coherent with
> > URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> > This is because we want to avoid frequent DMA mappings, which are
> > generally expensive.
> > However, memories allocated in this way are not cached.
> >
> > So, we wonder if using usb_alloc_coherent is really fast.
> > In other words, we want to know which is better:
> > "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached memory".
> >
> > For this purpose, I'm planning to measure performance on ARM
> > Chromebooks in the following conditions:
> > 1. Current implementation with Kieran's patches
> > 2. 1. + my patch
> > 3. Use kmalloc instead
> >
> > 1 and 2 are the same conditions I reported in the first mail on this thread.
> > For condition 3, I only have to add "#define CONFIG_DMA_NONCOHERENT"
> > at the beginning of uvc_video.c.
>
> I'd be interested in numbers/performances both with and without my async
> if possible too.
>
> The async path can be easily disabled temporarily with the following
> change: (perhaps this should be a module option?)
>
> diff --git a/drivers/media/usb/uvc/uvc_video.c
> b/drivers/media/usb/uvc/uvc_video.c
> index 8bb6e90f3483..f9fbdc9bfa4b 100644
> --- a/drivers/media/usb/uvc/uvc_video.c
> +++ b/drivers/media/usb/uvc/uvc_video.c
> @@ -1505,7 +1505,8 @@ static void uvc_video_complete(struct urb *urb)
>         }
>
>         INIT_WORK(&uvc_urb->work, uvc_video_copy_data_work);
> -       queue_work(stream->async_wq, &uvc_urb->work);
> +//     queue_work(stream->async_wq, &uvc_urb->work);
> +       uvc_video_copy_data_work(&uvc_urb->work);
>  }
>
>  /*
>
>
> I do suspect that even with cached buffers, it's probably likely we
> should still consider the async patches to move the memcopy out of
> interrupt context.
>
It sounds better to check.
I'll do tests with/without your async patches in both cached/uncached
memory allocation conditions.

Best regards,
Keiichi

> --
> Regards
>
> Kieran
>
>
>
>
> >
> > Does this plan sound reasonable?
> >
> > Best regards,
> > Keiichi
> > On Wed, Aug 8, 2018 at 5:42 PM Laurent Pinchart
> > <laurent.pinchart@ideasonboard.com> wrote:
> >>
> >> Hi Tomasz,
> >>
> >> On Wednesday, 8 August 2018 07:08:59 EEST Tomasz Figa wrote:
> >>> On Tue, Jul 31, 2018 at 1:00 AM Laurent Pinchart wrote:
> >>>> On Wednesday, 27 June 2018 13:34:08 EEST Keiichi Watanabe wrote:
> >>>>> On some platforms with non-coherent DMA (e.g. ARM), USB drivers use
> >>>>> uncached memory allocation methods. In such situations, it sometimes
> >>>>> takes a long time to access URB buffers.  This can be a cause of video
> >>>>> flickering problems if a resolution is high and a USB controller has
> >>>>> a very tight time limit. (e.g. dwc2) To avoid this problem, we copy
> >>>>> header data from (uncached) URB buffer into (cached) local buffer.
> >>>>>
> >>>>> This change should make the elapsed time of the interrupt handler
> >>>>> shorter on platforms with non-coherent DMA. We measured the elapsed
> >>>>> time of each callback of uvc_video_complete without/with this patch
> >>>>> while capturing Full HD video in
> >>>>> https://webrtc.github.io/samples/src/content/getusermedia/resolution/.
> >>>>> I tested it on the top of Kieran Bingham's Asynchronous UVC series
> >>>>> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html.
> >>>>> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> >>>>> I collected data for 5 seconds. (There were around 480 callbacks in
> >>>>> this case.) The following result shows that this patch makes
> >>>>> uvc_video_complete about 2x faster.
> >>>>>
> >>>>>            | average | median  | min     | max     | standard deviation
> >>>>> w/o caching| 45319ns | 40250ns | 33834ns | 142625ns| 16611ns
> >>>>> w/  caching| 20620ns | 19250ns | 12250ns | 56583ns | 6285ns
> >>>>>
> >>>>> In addition, we confirmed that this patch doesn't make it worse on
> >>>>> coherent DMA architecture by performing the same measurements on a
> >>>>> Broadwell Chromebox with the same camera.
> >>>>>
> >>>>>            | average | median  | min     | max     | standard deviation
> >>>>> w/o caching| 21026ns | 21424ns | 12263ns | 23956ns | 1932ns
> >>>>> w/  caching| 20728ns | 20398ns |  8922ns | 45120ns | 3368ns
> >>>>
> >>>> This is very interesting, and it seems related to https://
> >>>> patchwork.kernel.org/patch/10468937/. You might have seen that discussion
> >>>> as you got CC'ed at some point.
> >>>>
> >>>> I wonder whether performances couldn't be further improved by allocating
> >>>> the URB buffers cached, as that would speed up the memcpy() as well. Have
> >>>> you tested that by any chance ?
> >>>
> >>> We haven't measure it, but the issue being solved here was indeed
> >>> significantly reduced by using cached URB buffers, even without
> >>> Kieran's async series. After we discovered the latter, we just
> >>> backported it and decided to further tweak the last remaining bit, to
> >>> avoid playing too much with the DMA API in code used in production on
> >>> several different platforms (including both ARM and x86).
> >>>
> >>> If you think we could change the driver to use cached buffers instead
> >>> (as the pwc driver mentioned in another thread), I wouldn't have
> >>> anything against it obviously.
> >>
> >> I think there's a chance that performances could be further improved.
> >> Furthermore, it would lean to simpler code as we wouldn't need to deal with
> >> caching headers manually. I would however like to see numbers before making a
> >> decision.
> >>
> >> --
> >> Regards,
> >>
> >> Laurent Pinchart
> >>
> >>
> >>
>
> --
> Regards
> --
> Kieran

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 12:45       ` Keiichi Watanabe
  2018-08-08 13:07         ` Kieran Bingham
@ 2018-08-08 14:20         ` Alan Stern
  2018-08-08 16:22           ` Laurent Pinchart
  1 sibling, 1 reply; 23+ messages in thread
From: Alan Stern @ 2018-08-08 14:20 UTC (permalink / raw)
  To: Keiichi Watanabe
  Cc: Laurent Pinchart, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, kieran.bingham,
	Douglas Anderson, ezequiel, matwey

On Wed, 8 Aug 2018, Keiichi Watanabe wrote:

> Hi Laurent, Kieran, Tomasz,
> 
> Thank you for reviews and suggestions.
> I want to do additional measurements for improving the performance.
> 
> Let me clarify my understanding:
> Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> urb_buffer is allocated by usb_alloc_coherent with
> URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.

Not exactly.  You are mixing up allocation with mapping.  The speed of 
the allocation doesn't matter; all that matters is whether the memory 
is cached and when it gets mapped/unmapped.

> This is because we want to avoid frequent DMA mappings, which are
> generally expensive.
> However, memories allocated in this way are not cached.
> 
> So, we wonder if using usb_alloc_coherent is really fast.
> In other words, we want to know which is better:
> "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached memory".

There is no need to wonder.  "Frequent DMA mapping/Cached memory" is 
always faster than "No DMA mapping/Uncached memory".

The only issue is that on some platform (such as x86) but not others,
there is a third option: "No DMA mapping/Cached memory".  On platforms 
which support it, this is the fastest option.

Alan Stern


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 14:20         ` Alan Stern
@ 2018-08-08 16:22           ` Laurent Pinchart
  2018-08-08 16:27             ` Tomasz Figa
                               ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Laurent Pinchart @ 2018-08-08 16:22 UTC (permalink / raw)
  To: Alan Stern
  Cc: Keiichi Watanabe, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, kieran.bingham,
	Douglas Anderson, ezequiel, matwey

Hello,

On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
> On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
> > Hi Laurent, Kieran, Tomasz,
> > 
> > Thank you for reviews and suggestions.
> > I want to do additional measurements for improving the performance.
> > 
> > Let me clarify my understanding:
> > Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> > urb_buffer is allocated by usb_alloc_coherent with
> > URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> 
> Not exactly.  You are mixing up allocation with mapping.  The speed of
> the allocation doesn't matter; all that matters is whether the memory
> is cached and when it gets mapped/unmapped.
> 
> > This is because we want to avoid frequent DMA mappings, which are
> > generally expensive. However, memories allocated in this way are not
> > cached.
> > 
> > So, we wonder if using usb_alloc_coherent is really fast.
> > In other words, we want to know which is better:
> > "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
> > memory".

The second option should also be split in two:

- cached memory with DMA mapping/unmapping around each transfer
- cached memory with DMA mapping/unmapping at allocation/free time, and DMA 
sync around each transfer

The second option should in theory lead to at least slightly better 
performances, but tests with the pwc driver have reported contradictory 
results. I'd like to know whether that's also the case with the uvcvideo 
driver, and if so, why.

> There is no need to wonder.  "Frequent DMA mapping/Cached memory" is
> always faster than "No DMA mapping/Uncached memory".

Is it really, doesn't it depend on the CPU access pattern ?

> The only issue is that on some platform (such as x86) but not others,
> there is a third option: "No DMA mapping/Cached memory".  On platforms
> which support it, this is the fastest option.

-- 
Regards,

Laurent Pinchart




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 16:22           ` Laurent Pinchart
@ 2018-08-08 16:27             ` Tomasz Figa
  2018-08-08 22:33               ` Laurent Pinchart
  2018-08-08 16:29             ` Ezequiel Garcia
  2018-08-08 17:04             ` Alan Stern
  2 siblings, 1 reply; 23+ messages in thread
From: Tomasz Figa @ 2018-08-08 16:27 UTC (permalink / raw)
  To: Laurent Pinchart
  Cc: Alan Stern, keiichiw, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, Kieran Bingham,
	Douglas Anderson, Ezequiel Garcia, Matwey V. Kornilov

On Thu, Aug 9, 2018 at 1:21 AM Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
>
> Hello,
>
> On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
> > On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
> > > Hi Laurent, Kieran, Tomasz,
> > >
> > > Thank you for reviews and suggestions.
> > > I want to do additional measurements for improving the performance.
> > >
> > > Let me clarify my understanding:
> > > Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> > > urb_buffer is allocated by usb_alloc_coherent with
> > > URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> >
> > Not exactly.  You are mixing up allocation with mapping.  The speed of
> > the allocation doesn't matter; all that matters is whether the memory
> > is cached and when it gets mapped/unmapped.
> >
> > > This is because we want to avoid frequent DMA mappings, which are
> > > generally expensive. However, memories allocated in this way are not
> > > cached.
> > >
> > > So, we wonder if using usb_alloc_coherent is really fast.
> > > In other words, we want to know which is better:
> > > "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
> > > memory".
>
> The second option should also be split in two:
>
> - cached memory with DMA mapping/unmapping around each transfer
> - cached memory with DMA mapping/unmapping at allocation/free time, and DMA
> sync around each transfer
>
> The second option should in theory lead to at least slightly better
> performances, but tests with the pwc driver have reported contradictory
> results. I'd like to know whether that's also the case with the uvcvideo
> driver, and if so, why.

I thought that the results from retesting on pwc, after making sure
that cpu frequency stays the same all the time, actually clarified
this and indeed map once, sync repeatedly was the fastest?

Best regards,
Tomasz

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 16:22           ` Laurent Pinchart
  2018-08-08 16:27             ` Tomasz Figa
@ 2018-08-08 16:29             ` Ezequiel Garcia
  2018-08-08 22:35               ` Laurent Pinchart
  2018-08-08 17:04             ` Alan Stern
  2 siblings, 1 reply; 23+ messages in thread
From: Ezequiel Garcia @ 2018-08-08 16:29 UTC (permalink / raw)
  To: Laurent Pinchart
  Cc: Alan Stern, Keiichi Watanabe, Tomasz Figa,
	Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, kieran.bingham, Douglas Anderson,
	Ezequiel Garcia, matwey

On 8 August 2018 at 13:22, Laurent Pinchart
<laurent.pinchart@ideasonboard.com> wrote:
> Hello,
>
> On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
>> On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
>> > Hi Laurent, Kieran, Tomasz,
>> >
>> > Thank you for reviews and suggestions.
>> > I want to do additional measurements for improving the performance.
>> >
>> > Let me clarify my understanding:
>> > Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
>> > urb_buffer is allocated by usb_alloc_coherent with
>> > URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
>>
>> Not exactly.  You are mixing up allocation with mapping.  The speed of
>> the allocation doesn't matter; all that matters is whether the memory
>> is cached and when it gets mapped/unmapped.
>>
>> > This is because we want to avoid frequent DMA mappings, which are
>> > generally expensive. However, memories allocated in this way are not
>> > cached.
>> >
>> > So, we wonder if using usb_alloc_coherent is really fast.
>> > In other words, we want to know which is better:
>> > "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
>> > memory".
>
> The second option should also be split in two:
>
> - cached memory with DMA mapping/unmapping around each transfer
> - cached memory with DMA mapping/unmapping at allocation/free time, and DMA
> sync around each transfer
>

I agree with this, the second one should be better.

I still wonder if there is anyway we can create a helper for this,
as I am under the impression most USB video4linux drivers
will want to implement the same.

> The second option should in theory lead to at least slightly better
> performances, but tests with the pwc driver have reported contradictory
> results. I'd like to know whether that's also the case with the uvcvideo
> driver, and if so, why.
>

I believe that is no longer the case. Matwey measured again and the results
are what we expected: a single mapping, and sync in the interrupt handler
is a little bit faster. See https://lkml.org/lkml/2018/8/4/44

2) dma_unmap and dma_map in the handler:
2A) dma_unmap_single call: 28.8 +- 1.5 usec
2B) memcpy and the rest: 58 +- 6 usec
2C) dma_map_single call: 22 +- 2 usec
Total: 110 +- 7 usec

3) dma_sync_single_for_cpu
3A) dma_sync_single_for_cpu call: 29.4 +- 1.7 usec
3B) memcpy and the rest: 59 +- 6 usec
3C) noop (trace events overhead): 5 +- 2 usec
Total: 93 +- 7 usec

-- 
Ezequiel García, VanguardiaSur
www.vanguardiasur.com.ar

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 16:22           ` Laurent Pinchart
  2018-08-08 16:27             ` Tomasz Figa
  2018-08-08 16:29             ` Ezequiel Garcia
@ 2018-08-08 17:04             ` Alan Stern
  2018-08-08 22:20               ` Laurent Pinchart
  2 siblings, 1 reply; 23+ messages in thread
From: Alan Stern @ 2018-08-08 17:04 UTC (permalink / raw)
  To: Laurent Pinchart
  Cc: Keiichi Watanabe, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, kieran.bingham,
	Douglas Anderson, ezequiel, matwey

On Wed, 8 Aug 2018, Laurent Pinchart wrote:

> Hello,
> 
> On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
> > On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
> > > Hi Laurent, Kieran, Tomasz,
> > > 
> > > Thank you for reviews and suggestions.
> > > I want to do additional measurements for improving the performance.
> > > 
> > > Let me clarify my understanding:
> > > Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> > > urb_buffer is allocated by usb_alloc_coherent with
> > > URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> > 
> > Not exactly.  You are mixing up allocation with mapping.  The speed of
> > the allocation doesn't matter; all that matters is whether the memory
> > is cached and when it gets mapped/unmapped.
> > 
> > > This is because we want to avoid frequent DMA mappings, which are
> > > generally expensive. However, memories allocated in this way are not
> > > cached.
> > > 
> > > So, we wonder if using usb_alloc_coherent is really fast.
> > > In other words, we want to know which is better:
> > > "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
> > > memory".
> 
> The second option should also be split in two:
> 
> - cached memory with DMA mapping/unmapping around each transfer
> - cached memory with DMA mapping/unmapping at allocation/free time, and DMA 
> sync around each transfer
> 
> The second option should in theory lead to at least slightly better 
> performances, but tests with the pwc driver have reported contradictory 
> results. I'd like to know whether that's also the case with the uvcvideo 
> driver, and if so, why.
> 
> > There is no need to wonder.  "Frequent DMA mapping/Cached memory" is
> > always faster than "No DMA mapping/Uncached memory".
> 
> Is it really, doesn't it depend on the CPU access pattern ?

Well, if your access pattern involves transferring data in from the
device and then throwing it away without reading it, you might get a
different result.  :-)  But assuming you plan to read the data after
transferring it, using uncached memory slows things down so much that
the overhead of DMA mapping/unmapping is negligible by comparison.

The only exception might be if you were talking about very small
amounts of data.  I don't know exactly where the crossover occurs, but
bear in mind that Matwey's tests required ~50 us for mapping/unmapping
and 3000 us for accessing uncached memory.  He didn't say how large the
transfers were, but that's still a pretty big difference.

Alan Stern

> > The only issue is that on some platform (such as x86) but not others,
> > there is a third option: "No DMA mapping/Cached memory".  On platforms
> > which support it, this is the fastest option.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 17:04             ` Alan Stern
@ 2018-08-08 22:20               ` Laurent Pinchart
  2018-08-09 14:12                 ` Alan Stern
  0 siblings, 1 reply; 23+ messages in thread
From: Laurent Pinchart @ 2018-08-08 22:20 UTC (permalink / raw)
  To: Alan Stern
  Cc: Keiichi Watanabe, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, kieran.bingham,
	Douglas Anderson, ezequiel, matwey

Hi Alan,

On Wednesday, 8 August 2018 20:04:45 EEST Alan Stern wrote:
> On Wed, 8 Aug 2018, Laurent Pinchart wrote:
> > On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
> >> On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
> >>> Hi Laurent, Kieran, Tomasz,
> >>> 
> >>> Thank you for reviews and suggestions.
> >>> I want to do additional measurements for improving the performance.
> >>> 
> >>> Let me clarify my understanding:
> >>> Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> >>> urb_buffer is allocated by usb_alloc_coherent with
> >>> URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> >> 
> >> Not exactly.  You are mixing up allocation with mapping.  The speed of
> >> the allocation doesn't matter; all that matters is whether the memory
> >> is cached and when it gets mapped/unmapped.
> >> 
> >>> This is because we want to avoid frequent DMA mappings, which are
> >>> generally expensive. However, memories allocated in this way are not
> >>> cached.
> >>> 
> >>> So, we wonder if using usb_alloc_coherent is really fast.
> >>> In other words, we want to know which is better:
> >>> "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
> >>> memory".
> > 
> > The second option should also be split in two:
> > 
> > - cached memory with DMA mapping/unmapping around each transfer
> > - cached memory with DMA mapping/unmapping at allocation/free time, and
> > DMA sync around each transfer
> > 
> > The second option should in theory lead to at least slightly better
> > performances, but tests with the pwc driver have reported contradictory
> > results. I'd like to know whether that's also the case with the uvcvideo
> > driver, and if so, why.
> > 
> >> There is no need to wonder.  "Frequent DMA mapping/Cached memory" is
> >> always faster than "No DMA mapping/Uncached memory".
> > 
> > Is it really, doesn't it depend on the CPU access pattern ?
> 
> Well, if your access pattern involves transferring data in from the
> device and then throwing it away without reading it, you might get a
> different result.  :-)  But assuming you plan to read the data after
> transferring it, using uncached memory slows things down so much that
> the overhead of DMA mapping/unmapping is negligible by comparison.

:-) I suppose it would also depend on the access pattern, if I only need to 
access part of the buffer, performance figures may vary. In this case however 
the whole buffer needs to be copied.

> The only exception might be if you were talking about very small
> amounts of data.  I don't know exactly where the crossover occurs, but
> bear in mind that Matwey's tests required ~50 us for mapping/unmapping
> and 3000 us for accessing uncached memory.  He didn't say how large the
> transfers were, but that's still a pretty big difference.

For UVC devices using bulk endpoints data buffers are typically tens of kBs. 
For devices using isochronous endpoints, that goes down to possibly hundreds 
of bytes for some buffers. Devices can send less data than the maximum packet 
size, and mapping/unmapping would still invalidate the cache for the whole 
buffer. If we keep the mappings around and use the DMA sync API, we could 
possibly restrict the cache invalidation to the portion of the buffer actually 
written to.

> >> The only issue is that on some platform (such as x86) but not others,
> >> there is a third option: "No DMA mapping/Cached memory".  On platforms
> >> which support it, this is the fastest option.

-- 
Regards,

Laurent Pinchart




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 16:27             ` Tomasz Figa
@ 2018-08-08 22:33               ` Laurent Pinchart
  0 siblings, 0 replies; 23+ messages in thread
From: Laurent Pinchart @ 2018-08-08 22:33 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: Alan Stern, keiichiw, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, Kieran Bingham,
	Douglas Anderson, Ezequiel Garcia, Matwey V. Kornilov

Hi Tomasz,

On Wednesday, 8 August 2018 19:27:42 EEST Tomasz Figa wrote:
> On Thu, Aug 9, 2018 at 1:21 AM Laurent Pinchart wrote:
> > On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
> >> On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
> >>> Hi Laurent, Kieran, Tomasz,
> >>> 
> >>> Thank you for reviews and suggestions.
> >>> I want to do additional measurements for improving the performance.
> >>> 
> >>> Let me clarify my understanding:
> >>> Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> >>> urb_buffer is allocated by usb_alloc_coherent with
> >>> URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> >> 
> >> Not exactly.  You are mixing up allocation with mapping.  The speed of
> >> the allocation doesn't matter; all that matters is whether the memory
> >> is cached and when it gets mapped/unmapped.
> >> 
> >>> This is because we want to avoid frequent DMA mappings, which are
> >>> generally expensive. However, memories allocated in this way are not
> >>> cached.
> >>> 
> >>> So, we wonder if using usb_alloc_coherent is really fast.
> >>> In other words, we want to know which is better:
> >>> "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
> >>> memory".
> > 
> > The second option should also be split in two:
> > 
> > - cached memory with DMA mapping/unmapping around each transfer
> > - cached memory with DMA mapping/unmapping at allocation/free time, and
> > DMA sync around each transfer
> > 
> > The second option should in theory lead to at least slightly better
> > performances, but tests with the pwc driver have reported contradictory
> > results. I'd like to know whether that's also the case with the uvcvideo
> > driver, and if so, why.
> 
> I thought that the results from retesting on pwc, after making sure
> that cpu frequency stays the same all the time, actually clarified
> this and indeed map once, sync repeatedly was the fastest?

I had missed that, thank you for the information. I've now caught up with the 
pwc thread.

-- 
Regards,

Laurent Pinchart




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 16:29             ` Ezequiel Garcia
@ 2018-08-08 22:35               ` Laurent Pinchart
  0 siblings, 0 replies; 23+ messages in thread
From: Laurent Pinchart @ 2018-08-08 22:35 UTC (permalink / raw)
  To: Ezequiel Garcia
  Cc: Alan Stern, Keiichi Watanabe, Tomasz Figa,
	Linux Kernel Mailing List, Mauro Carvalho Chehab,
	Linux Media Mailing List, kieran.bingham, Douglas Anderson,
	Ezequiel Garcia, matwey

Hi Ezequiel,

On Wednesday, 8 August 2018 19:29:56 EEST Ezequiel Garcia wrote:
> On 8 August 2018 at 13:22, Laurent Pinchart wrote:
> > On Wednesday, 8 August 2018 17:20:21 EEST Alan Stern wrote:
> >> On Wed, 8 Aug 2018, Keiichi Watanabe wrote:
> >>> Hi Laurent, Kieran, Tomasz,
> >>> 
> >>> Thank you for reviews and suggestions.
> >>> I want to do additional measurements for improving the performance.
> >>> 
> >>> Let me clarify my understanding:
> >>> Currently, if the platform doesn't support coherent-DMA (e.g. ARM),
> >>> urb_buffer is allocated by usb_alloc_coherent with
> >>> URB_NO_TRANSFER_DMA_MAP flag instead of using kmalloc.
> >> 
> >> Not exactly.  You are mixing up allocation with mapping.  The speed of
> >> the allocation doesn't matter; all that matters is whether the memory
> >> is cached and when it gets mapped/unmapped.
> >> 
> >>> This is because we want to avoid frequent DMA mappings, which are
> >>> generally expensive. However, memories allocated in this way are not
> >>> cached.
> >>> 
> >>> So, we wonder if using usb_alloc_coherent is really fast.
> >>> In other words, we want to know which is better:
> >>> "No DMA mapping/Uncached memory" v.s. "Frequent DMA mapping/Cached
> >>> memory".
> > 
> > The second option should also be split in two:
> > 
> > - cached memory with DMA mapping/unmapping around each transfer
> > - cached memory with DMA mapping/unmapping at allocation/free time, and
> > DMA sync around each transfer
> 
> I agree with this, the second one should be better.
> 
> I still wonder if there is anyway we can create a helper for this,
> as I am under the impression most USB video4linux drivers
> will want to implement the same.

I agree with you, drivers shouldn't care.

> > The second option should in theory lead to at least slightly better
> > performances, but tests with the pwc driver have reported contradictory
> > results. I'd like to know whether that's also the case with the uvcvideo
> > driver, and if so, why.
> 
> I believe that is no longer the case. Matwey measured again and the results
> are what we expected: a single mapping, and sync in the interrupt handler
> is a little bit faster. See https://lkml.org/lkml/2018/8/4/44
> 
> 2) dma_unmap and dma_map in the handler:
> 2A) dma_unmap_single call: 28.8 +- 1.5 usec
> 2B) memcpy and the rest: 58 +- 6 usec
> 2C) dma_map_single call: 22 +- 2 usec
> Total: 110 +- 7 usec
> 
> 3) dma_sync_single_for_cpu
> 3A) dma_sync_single_for_cpu call: 29.4 +- 1.7 usec
> 3B) memcpy and the rest: 59 +- 6 usec
> 3C) noop (trace events overhead): 5 +- 2 usec
> Total: 93 +- 7 usec

I hadn't caught up with the pwc e-mail thread, I now have, and I'm happy to 
see that everything is now properly understood. Thanks again Matwey for your 
work.

-- 
Regards,

Laurent Pinchart




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-08 22:20               ` Laurent Pinchart
@ 2018-08-09 14:12                 ` Alan Stern
  2018-08-24  6:06                   ` Keiichi Watanabe
  0 siblings, 1 reply; 23+ messages in thread
From: Alan Stern @ 2018-08-09 14:12 UTC (permalink / raw)
  To: Laurent Pinchart
  Cc: Keiichi Watanabe, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, kieran.bingham,
	Douglas Anderson, ezequiel, matwey

On Thu, 9 Aug 2018, Laurent Pinchart wrote:

> > >> There is no need to wonder.  "Frequent DMA mapping/Cached memory" is
> > >> always faster than "No DMA mapping/Uncached memory".
> > > 
> > > Is it really, doesn't it depend on the CPU access pattern ?
> > 
> > Well, if your access pattern involves transferring data in from the
> > device and then throwing it away without reading it, you might get a
> > different result.  :-)  But assuming you plan to read the data after
> > transferring it, using uncached memory slows things down so much that
> > the overhead of DMA mapping/unmapping is negligible by comparison.
> 
> :-) I suppose it would also depend on the access pattern, if I only need to 
> access part of the buffer, performance figures may vary. In this case however 
> the whole buffer needs to be copied.
> 
> > The only exception might be if you were talking about very small
> > amounts of data.  I don't know exactly where the crossover occurs, but
> > bear in mind that Matwey's tests required ~50 us for mapping/unmapping
> > and 3000 us for accessing uncached memory.  He didn't say how large the
> > transfers were, but that's still a pretty big difference.
> 
> For UVC devices using bulk endpoints data buffers are typically tens of kBs. 
> For devices using isochronous endpoints, that goes down to possibly hundreds 
> of bytes for some buffers. Devices can send less data than the maximum packet 
> size, and mapping/unmapping would still invalidate the cache for the whole 
> buffer. If we keep the mappings around and use the DMA sync API, we could 
> possibly restrict the cache invalidation to the portion of the buffer actually 
> written to.

Furthermore, invalidating a cache is likely to require less overhead
than using non-cacheable memory.  After the cache has been invalidated,
it can be repopulated relatively quickly (an entire cache line at a
time), whereas reading uncached memory requires a slow transaction for
each individual read operation.

I think adding support to the USB core for 
dma_sync_single_for_{cpu|device} would be a good approach.  In fact, I 
wonder whether using coherent mappings provides any benefit at all.

Alan Stern


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-09 14:12                 ` Alan Stern
@ 2018-08-24  6:06                   ` Keiichi Watanabe
  2018-08-24 22:00                     ` Kieran Bingham
  0 siblings, 1 reply; 23+ messages in thread
From: Keiichi Watanabe @ 2018-08-24  6:06 UTC (permalink / raw)
  To: Alan Stern
  Cc: Laurent Pinchart, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List, kieran.bingham,
	Douglas Anderson, ezequiel, Matwey V. Kornilov

Hi all.

We performed two types of experiments.

In the first experiment, we compared the performance of uvcvideo by
changing a way of memory allocation, usb_alloc_coherent and kmalloc.
At the same time, we changed conditions by enabling/disabling
asynchronous memory copy suggested by Kieran in [1].

The second experiment is a comparison between dma_unmap/map and dma_sync.
Here, DMA mapping is done manually in uvc handlers. This is similar to
Matwey's patch for pwc.
https://patchwork.kernel.org/patch/10468937/.

Raw data are pasted after descriptions of the experiments.

# Settings
The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
We did video capturing at
https://webrtc.github.io/samples/src/content/getusermedia/resolution/
with Full HD resolution in about 30 seconds for each condition.

For logging statistics, I used Kieran's patch [2].

## Exp. 1
Here, we have two parameters, way of memory allocation and
enabling/disabling async memcopy.
So, there were 4 combinations:

A. No Async + usb_alloc_coherent (with my patch caching header data)
   Patch: [3] + [4]
   Since Kieran's async patches are already merged into ChromeOS's
kernel, we disabled it by [3].
   Patch [4] is an updated version of my patch.

B. No Async + kmalloc
   Patch: [3] + [5]
   [5] just adds '#define CONFIG_DMA_NONCOHERENT' at the beginning of
uvc_video.c to use kmalloc.

C. Async + usb_alloc_coherent (with my patch caching header data)
   Patch: [4]

D. Async + kmalloc
   Patch: [5]

## Exp. 2
The conditions of the second experiment are based on condition D,
where URB buffers are allocated by kmalloc and Kieran's asynchronous
patches are enabled.

E. Async + kmalloc + manually unmap/map for each packet
   Patch: [6]
   URB_NO_TRANSFER_DMA_MAP flag is used here. dma_map and dma_unmap
are explicitly called manually in uvc_video.c.

F. Async + kmalloc + manually sync for each packet
   Patch: [7]
In uvc_video_complete, dma_sync_single_for_cpu is called instead of
dma_unmap_single and dma_map_single.

Note that the elapsed times for E and F cannot be compared with those
for D in a simple way.
This is because we don't measure elapsed time of functions outside of
uvcvideo.c by [2].
For example, while DMA-unmapping for each packet is done before
uvc_video_complete is called at the condition D,
it's done in uvc_video_complete at the condition E.

# References for patches
[1] Asynchronous UVC
https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html

[2] Kieran's patch for measuring the performance of uvcvideo.
https://git.kernel.org/pub/scm/linux/kernel/git/kbingham/rcar.git/commit/?h=uvc/async-ml&id=cebbd1b629bbe5f856ec5dc7591478c003f5a944
I used the modified version of it for ChromeOS, but almost same.
http://crrev.com/c/1184597
The main difference is that our patch uses do_div instead of / and %.

[3] Disable asynchronous decoding
http://crrev.com/c/1184658

[4] Cache URB header data before processing
http://crrev.com/c/1179554
This is an updated version of my patch I sent at the beginning of this thread.
I applied Kieran's review comments.

[5] Use kmalloc for urb buffer
http://crrev.com/c/1184643

[6] Manually DMA dma_unmap/map for each packet
http://crrev.com/c/1186293

[7] Manually DMA sync for each packet
http://crrev.com/c/1186214

# Results

For the meanings of each value, please see Kieran's patch:
https://git.kernel.org/pub/scm/linux/kernel/git/kbingham/rcar.git/commit/?h=uvc/async-ml&id=cebbd1b629bbe5f856ec5dc7591478c003f5a944

## Exp. 1

A. No Async + usb_alloc_coherent (with my patch caching header data)
frames:  1121
packets: 233471
empty:   44729 (19 %)
errors:  34801
invalid: 8017
pts: 1121 early, 986 initial, 1121 ok
scr: 1121 count ok, 111 diff ok
sof: 0 <= sof <= 0, freq 0.000 kHz
bytes 135668717 : duration 32907
FPS: 34.06
URB: 427489/3048 uS/qty: 140.252 avg 2.625 min 660.625 max (uS)
header: 440868/3048 uS/qty: 144.641 avg 0.000 min 674.625 max (uS)
latency: 30703/3048 uS/qty: 10.073 avg 0.875 min 26.541 max (uS)
decode: 396785/3048 uS/qty: 130.179 avg 0.875 min 634.375 max (uS)
raw decode speed: 2.740 Gbits/s
raw URB handling speed: 2.541 Gbits/s
throughput: 32.982 Mbits/s
URB decode CPU usage 1.205779 %

---

B. No Async memcpy + kmalloc
frames:  949
packets: 243665
empty:   47804 (19 %)
errors:  2406
invalid: 1058
pts: 949 early, 927 initial, 860 ok
scr: 949 count ok, 17 diff ok
sof: 0 <= sof <= 0, freq 0.000 kHz
bytes 145563878 : duration 30939
FPS: 30.67
URB: 107265/2448 uS/qty: 43.817 avg 3.791 min 212.042 max (uS)
header: 192608/2448 uS/qty: 78.679 avg 0.000 min 471.625 max (uS)
latency: 24860/2448 uS/qty: 10.155 avg 1.750 min 28.000 max (uS)
decode: 82405/2448 uS/qty: 33.662 avg 1.750 min 186.084 max (uS)
raw decode speed: 14.201 Gbits/s
raw URB handling speed: 10.883 Gbits/s
throughput: 37.638 Mbits/s
URB decode CPU usage 0.266349 %

---

C. Async + usb_alloc_coherent (with my patch caching header data)
frames:  874
packets: 232786
empty:   45594 (19 %)
errors:  46
invalid: 48
pts: 874 early, 873 initial, 874 ok
scr: 874 count ok, 1 diff ok
sof: 0 <= sof <= 0, freq 0.000 kHz
bytes 137989497 : duration 29139
FPS: 29.99
URB: 577349/2301 uS/qty: 250.912 avg 24.208 min 1009.459 max (uS)
header: 50334/2301 uS/qty: 21.874 avg 0.000 min 77.583 max (uS)
latency: 77597/2301 uS/qty: 33.723 avg 15.458 min 172.375 max (uS)
decode: 499751/2301 uS/qty: 217.188 avg 4.084 min 978.542 max (uS)
raw decode speed: 2.212 Gbits/s
raw URB handling speed: 1.913 Gbits/s
throughput: 37.884 Mbits/s
URB decode CPU usage 1.715060 %

---

D. Async memcpy + kmalloc
frames:  870
packets: 231152
empty:   45390 (19 %)
errors:  171
invalid: 160
pts: 870 early, 870 initial, 810 ok
scr: 870 count ok, 0 diff ok
sof: 0 <= sof <= 0, freq 0.000 kHz
bytes 137406842 : duration 29036
FPS: 29.96
URB: 160821/2258 uS/qty: 71.222 avg 15.750 min 985.542 max (uS)
header: 40369/2258 uS/qty: 17.878 avg 0.000 min 56.292 max (uS)
latency: 72411/2258 uS/qty: 32.068 avg 10.792 min 946.459 max (uS)
decode: 88410/2258 uS/qty: 39.154 avg 1.458 min 246.167 max (uS)
raw decode speed: 12.491 Gbits/s
raw URB handling speed: 6.870 Gbits/s
throughput: 37.858 Mbits/s
URB decode CPU usage 0.304485 %

----------------------------------------
## Exp. 2

E. Async + kmalloc + manually dma_unmap/map for each packet
frames:  928
packets: 247476
empty:   34060 (13 %)
errors:  16
invalid: 32
pts: 928 early, 928 initial, 163 ok
scr: 928 count ok, 0 diff ok
sof: 0 <= sof <= 0, freq 0.000 kHz
bytes 103315132 : duration 30949
FPS: 29.98
URB: 169873/1876 uS/qty: 90.551 avg 43.750 min 289.917 max (uS)
header: 88962/1876 uS/qty: 47.421 avg 0.000 min 113.750 max (uS)
latency: 109539/1876 uS/qty: 58.389 avg 37.042 min 253.459 max (uS)
decode: 60334/1876 uS/qty: 32.161 avg 2.041 min 124.542 max (uS)
raw decode speed: 13.775 Gbits/s
raw URB handling speed: 4.890 Gbits/s
throughput: 26.705 Mbits/s
URB decode CPU usage 0.194948 %

---

F. Async + kmalloc + manually dma_sync for each packet
frames:  927
packets: 246994
empty:   33997 (13 %)
errors:  226
invalid: 65
pts: 927 early, 927 initial, 560 ok
scr: 927 count ok, 0 diff ok
sof: 0 <= sof <= 0, freq 0.000 kHz
bytes 103017167 : duration 30938
FPS: 29.96
URB: 170630/1868 uS/qty: 91.344 avg 43.167 min 1142.167 max (uS)
header: 86372/1868 uS/qty: 46.237 avg 0.000 min 163.917 max (uS)
latency: 109148/1868 uS/qty: 58.430 avg 35.292 min 1106.583 max (uS)
decode: 61482/1868 uS/qty: 32.913 avg 2.334 min 215.833 max (uS)
raw decode speed: 13.510 Gbits/s
raw URB handling speed: 4.847 Gbits/s
throughput: 26.638 Mbits/s
URB decode CPU usage 0.198726 %

----------------------------------------

I hope this helps.

Best regards,
Keiichi
On Thu, Aug 9, 2018 at 11:12 PM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> On Thu, 9 Aug 2018, Laurent Pinchart wrote:
>
> > > >> There is no need to wonder.  "Frequent DMA mapping/Cached memory" is
> > > >> always faster than "No DMA mapping/Uncached memory".
> > > >
> > > > Is it really, doesn't it depend on the CPU access pattern ?
> > >
> > > Well, if your access pattern involves transferring data in from the
> > > device and then throwing it away without reading it, you might get a
> > > different result.  :-)  But assuming you plan to read the data after
> > > transferring it, using uncached memory slows things down so much that
> > > the overhead of DMA mapping/unmapping is negligible by comparison.
> >
> > :-) I suppose it would also depend on the access pattern, if I only need to
> > access part of the buffer, performance figures may vary. In this case however
> > the whole buffer needs to be copied.
> >
> > > The only exception might be if you were talking about very small
> > > amounts of data.  I don't know exactly where the crossover occurs, but
> > > bear in mind that Matwey's tests required ~50 us for mapping/unmapping
> > > and 3000 us for accessing uncached memory.  He didn't say how large the
> > > transfers were, but that's still a pretty big difference.
> >
> > For UVC devices using bulk endpoints data buffers are typically tens of kBs.
> > For devices using isochronous endpoints, that goes down to possibly hundreds
> > of bytes for some buffers. Devices can send less data than the maximum packet
> > size, and mapping/unmapping would still invalidate the cache for the whole
> > buffer. If we keep the mappings around and use the DMA sync API, we could
> > possibly restrict the cache invalidation to the portion of the buffer actually
> > written to.
>
> Furthermore, invalidating a cache is likely to require less overhead
> than using non-cacheable memory.  After the cache has been invalidated,
> it can be repopulated relatively quickly (an entire cache line at a
> time), whereas reading uncached memory requires a slow transaction for
> each individual read operation.
>
> I think adding support to the USB core for
> dma_sync_single_for_{cpu|device} would be a good approach.  In fact, I
> wonder whether using coherent mappings provides any benefit at all.
>
> Alan Stern
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing
  2018-08-24  6:06                   ` Keiichi Watanabe
@ 2018-08-24 22:00                     ` Kieran Bingham
  0 siblings, 0 replies; 23+ messages in thread
From: Kieran Bingham @ 2018-08-24 22:00 UTC (permalink / raw)
  To: Keiichi Watanabe, Alan Stern
  Cc: Laurent Pinchart, Tomasz Figa, Linux Kernel Mailing List,
	Mauro Carvalho Chehab, Linux Media Mailing List,
	Douglas Anderson, ezequiel, Matwey V. Kornilov

Hi Keiichi,

On 24/08/18 07:06, Keiichi Watanabe wrote:
> Hi all.
> 
> We performed two types of experiments.
> 
> In the first experiment, we compared the performance of uvcvideo by
> changing a way of memory allocation, usb_alloc_coherent and kmalloc.
> At the same time, we changed conditions by enabling/disabling
> asynchronous memory copy suggested by Kieran in [1].
> 
> The second experiment is a comparison between dma_unmap/map and dma_sync.
> Here, DMA mapping is done manually in uvc handlers. This is similar to
> Matwey's patch for pwc.
> https://patchwork.kernel.org/patch/10468937/.
> 
> Raw data are pasted after descriptions of the experiments.

Thank you for sharing the data and test cases.


> # Settings
> The test device was Jerry Chromebook (RK3288) with Logitech Brio 4K.
> We did video capturing at
> https://webrtc.github.io/samples/src/content/getusermedia/resolution/
> with Full HD resolution in about 30 seconds for each condition.
> 
> For logging statistics, I used Kieran's patch [2].
> 
> ## Exp. 1
> Here, we have two parameters, way of memory allocation and
> enabling/disabling async memcopy.
> So, there were 4 combinations:
> 
> A. No Async + usb_alloc_coherent (with my patch caching header data)
>    Patch: [3] + [4]
>    Since Kieran's async patches are already merged into ChromeOS's
> kernel, we disabled it by [3].
>    Patch [4] is an updated version of my patch.
> 
> B. No Async + kmalloc
>    Patch: [3] + [5]
>    [5] just adds '#define CONFIG_DMA_NONCOHERENT' at the beginning of
> uvc_video.c to use kmalloc.
> 
> C. Async + usb_alloc_coherent (with my patch caching header data)
>    Patch: [4]
> 
> D. Async + kmalloc
>    Patch: [5]
> 
> ## Exp. 2
> The conditions of the second experiment are based on condition D,
> where URB buffers are allocated by kmalloc and Kieran's asynchronous
> patches are enabled.
> 
> E. Async + kmalloc + manually unmap/map for each packet
>    Patch: [6]
>    URB_NO_TRANSFER_DMA_MAP flag is used here. dma_map and dma_unmap
> are explicitly called manually in uvc_video.c.
> 
> F. Async + kmalloc + manually sync for each packet
>    Patch: [7]
>    In uvc_video_complete, dma_single_for_cpu is called instead of
> dma_unmap_single and dma_map_single.
> 
> Note that the elapsed times for E and F cannot be compared with those
> for D in a simple way.
> This is because we don't measure elapsed time of functions outside of
> uvcvideo.c by [2].
> For example, while DMA-unmapping for each packet is done before
> uvc_video_complete is called at the condition D,
> it's done in uvc_video_complete at the condition E.
> 
> # References for patches
> [1] Asynchronous UVC
> https://www.mail-archive.com/linux-media@vger.kernel.org/msg128359.html
> 
> [2] Kieran's patch for measuring the performance of uvcvideo.
> https://git.kernel.org/pub/scm/linux/kernel/git/kbingham/rcar.git/commit/?h=uvc/async-ml&id=cebbd1b629bbe5f856ec5dc7591478c003f5a944
> I used the modified version of it for ChromeOS, but almost same.
> http://crrev.com/c/1184597
> The main difference is that our patch uses do_div instead of / and %.

Ah yes, I keep meaning to do this conversion, ever since the build-bots
warned me ...


> 
> [3] Disable asynchronous decoding
> http://crrev.com/c/1184658
> 
> [4] Cache URB header data before processing
> http://crrev.com/c/1179554
> This is an updated version of my patch I sent at the beginning of this thread.
> I applied Kieran's review comments.
> 
> [5] Use kmalloc for urb buffer
> http://crrev.com/c/1184643
> 
> [6] Manually DMA dma_unmap/map for each packet
> http://crrev.com/c/1186293
> 
> [7] Manually DMA sync for each packet
> http://crrev.com/c/1186214
> 
> # Results
> 
> For the meanings of each value, please see Kieran's patch:
> https://git.kernel.org/pub/scm/linux/kernel/git/kbingham/rcar.git/commit/?h=uvc/async-ml&id=cebbd1b629bbe5f856ec5dc7591478c003f5a944
> 
> ## Exp. 1
> 
> A. No Async + usb_alloc_coherent (with my patch caching header data)
> frames:  1121
> packets: 233471
> empty:   44729 (19 %)
> errors:  34801
> invalid: 8017
> pts: 1121 early, 986 initial, 1121 ok
> scr: 1121 count ok, 111 diff ok
> sof: 0 <= sof <= 0, freq 0.000 kHz
> bytes 135668717 : duration 32907
> FPS: 34.06
> URB: 427489/3048 uS/qty: 140.252 avg 2.625 min 660.625 max (uS)
> header: 440868/3048 uS/qty: 144.641 avg 0.000 min 674.625 max (uS)
> latency: 30703/3048 uS/qty: 10.073 avg 0.875 min 26.541 max (uS)
> decode: 396785/3048 uS/qty: 130.179 avg 0.875 min 634.375 max (uS)
> raw decode speed: 2.740 Gbits/s
> raw URB handling speed: 2.541 Gbits/s
> throughput: 32.982 Mbits/s
> URB decode CPU usage 1.205779 %
> 
> ---
> 
> B. No Async memcpy + kmalloc
> frames:  949
> packets: 243665
> empty:   47804 (19 %)
> errors:  2406
> invalid: 1058
> pts: 949 early, 927 initial, 860 ok
> scr: 949 count ok, 17 diff ok
> sof: 0 <= sof <= 0, freq 0.000 kHz
> bytes 145563878 : duration 30939
> FPS: 30.67
> URB: 107265/2448 uS/qty: 43.817 avg 3.791 min 212.042 max (uS)
> header: 192608/2448 uS/qty: 78.679 avg 0.000 min 471.625 max (uS)
> latency: 24860/2448 uS/qty: 10.155 avg 1.750 min 28.000 max (uS)
> decode: 82405/2448 uS/qty: 33.662 avg 1.750 min 186.084 max (uS)
> raw decode speed: 14.201 Gbits/s
> raw URB handling speed: 10.883 Gbits/s
> throughput: 37.638 Mbits/s
> URB decode CPU usage 0.266349 %
> 
> ---
> 
> C. Async + usb_alloc_coherent (with my patch caching header data)
> frames:  874
> packets: 232786
> empty:   45594 (19 %)
> errors:  46
> invalid: 48
> pts: 874 early, 873 initial, 874 ok
> scr: 874 count ok, 1 diff ok
> sof: 0 <= sof <= 0, freq 0.000 kHz
> bytes 137989497 : duration 29139
> FPS: 29.99
> URB: 577349/2301 uS/qty: 250.912 avg 24.208 min 1009.459 max (uS)
> header: 50334/2301 uS/qty: 21.874 avg 0.000 min 77.583 max (uS)
> latency: 77597/2301 uS/qty: 33.723 avg 15.458 min 172.375 max (uS)
> decode: 499751/2301 uS/qty: 217.188 avg 4.084 min 978.542 max (uS)
> raw decode speed: 2.212 Gbits/s
> raw URB handling speed: 1.913 Gbits/s
> throughput: 37.884 Mbits/s
> URB decode CPU usage 1.715060 %
> 
> ---
> 
> D. Async memcpy + kmalloc
> frames:  870
> packets: 231152
> empty:   45390 (19 %)
> errors:  171
> invalid: 160
> pts: 870 early, 870 initial, 810 ok
> scr: 870 count ok, 0 diff ok
> sof: 0 <= sof <= 0, freq 0.000 kHz
> bytes 137406842 : duration 29036
> FPS: 29.96
> URB: 160821/2258 uS/qty: 71.222 avg 15.750 min 985.542 max (uS)
> header: 40369/2258 uS/qty: 17.878 avg 0.000 min 56.292 max (uS)
> latency: 72411/2258 uS/qty: 32.068 avg 10.792 min 946.459 max (uS)
> decode: 88410/2258 uS/qty: 39.154 avg 1.458 min 246.167 max (uS)
> raw decode speed: 12.491 Gbits/s
> raw URB handling speed: 6.870 Gbits/s
> throughput: 37.858 Mbits/s
> URB decode CPU usage 0.304485 %
> 
> ----------------------------------------
> ## Exp. 2
> 
> E. Async + kmalloc + manually dma_unmap/map for each packet
> frames:  928
> packets: 247476
> empty:   34060 (13 %)
> errors:  16
> invalid: 32
> pts: 928 early, 928 initial, 163 ok
> scr: 928 count ok, 0 diff ok
> sof: 0 <= sof <= 0, freq 0.000 kHz
> bytes 103315132 : duration 30949
> FPS: 29.98
> URB: 169873/1876 uS/qty: 90.551 avg 43.750 min 289.917 max (uS)
> header: 88962/1876 uS/qty: 47.421 avg 0.000 min 113.750 max (uS)
> latency: 109539/1876 uS/qty: 58.389 avg 37.042 min 253.459 max (uS)
> decode: 60334/1876 uS/qty: 32.161 avg 2.041 min 124.542 max (uS)
> raw decode speed: 13.775 Gbits/s
> raw URB handling speed: 4.890 Gbits/s
> throughput: 26.705 Mbits/s
> URB decode CPU usage 0.194948 %
> 
> ---
> 
> F. Async + kmalloc + manually dma_sync for each packet
> frames:  927
> packets: 246994
> empty:   33997 (13 %)
> errors:  226
> invalid: 65
> pts: 927 early, 927 initial, 560 ok
> scr: 927 count ok, 0 diff ok
> sof: 0 <= sof <= 0, freq 0.000 kHz
> bytes 103017167 : duration 30938
> FPS: 29.96
> URB: 170630/1868 uS/qty: 91.344 avg 43.167 min 1142.167 max (uS)
> header: 86372/1868 uS/qty: 46.237 avg 0.000 min 163.917 max (uS)
> latency: 109148/1868 uS/qty: 58.430 avg 35.292 min 1106.583 max (uS)
> decode: 61482/1868 uS/qty: 32.913 avg 2.334 min 215.833 max (uS)
> raw decode speed: 13.510 Gbits/s
> raw URB handling speed: 4.847 Gbits/s
> throughput: 26.638 Mbits/s
> URB decode CPU usage 0.198726 %
> 
> ----------------------------------------
> 
> I hope this helps.
> 

To make this easier to interpret, I've extracted the values with [0] and
done some manual copy pasting to compile this test data into a
spreadsheet and share it on google-docs [1]:

I've gone through quickly and tried to colour code/highlight good and
bad values with some form of traffic light colour scheme.


[0] http://paste.ubuntu.com/p/W9jsCdYjpP/
[1]
https://docs.google.com/spreadsheets/d/1uPdbdVcebO9OQ0LQ8hR2LGIEySWgSnGwwhzv7LPXAlU/edit?usp=sharing


Regards

Kieran



> Best regards,
> Keiichi
> On Thu, Aug 9, 2018 at 11:12 PM Alan Stern <stern@rowland.harvard.edu> wrote:
>>
>> On Thu, 9 Aug 2018, Laurent Pinchart wrote:
>>
>>>>>> There is no need to wonder.  "Frequent DMA mapping/Cached memory" is
>>>>>> always faster than "No DMA mapping/Uncached memory".
>>>>>
>>>>> Is it really, doesn't it depend on the CPU access pattern ?
>>>>
>>>> Well, if your access pattern involves transferring data in from the
>>>> device and then throwing it away without reading it, you might get a
>>>> different result.  :-)  But assuming you plan to read the data after
>>>> transferring it, using uncached memory slows things down so much that
>>>> the overhead of DMA mapping/unmapping is negligible by comparison.
>>>
>>> :-) I suppose it would also depend on the access pattern, if I only need to
>>> access part of the buffer, performance figures may vary. In this case however
>>> the whole buffer needs to be copied.
>>>
>>>> The only exception might be if you were talking about very small
>>>> amounts of data.  I don't know exactly where the crossover occurs, but
>>>> bear in mind that Matwey's tests required ~50 us for mapping/unmapping
>>>> and 3000 us for accessing uncached memory.  He didn't say how large the
>>>> transfers were, but that's still a pretty big difference.
>>>
>>> For UVC devices using bulk endpoints data buffers are typically tens of kBs.
>>> For devices using isochronous endpoints, that goes down to possibly hundreds
>>> of bytes for some buffers. Devices can send less data than the maximum packet
>>> size, and mapping/unmapping would still invalidate the cache for the whole
>>> buffer. If we keep the mappings around and use the DMA sync API, we could
>>> possibly restrict the cache invalidation to the portion of the buffer actually
>>> written to.
>>
>> Furthermore, invalidating a cache is likely to require less overhead
>> than using non-cacheable memory.  After the cache has been invalidated,
>> it can be repopulated relatively quickly (an entire cache line at a
>> time), whereas reading uncached memory requires a slow transaction for
>> each individual read operation.
>>
>> I think adding support to the USB core for
>> dma_sync_single_for_{cpu|device} would be a good approach.  In fact, I
>> wonder whether using coherent mappings provides any benefit at all.
>>
>> Alan Stern
>>

-- 
Regards
--
Kieran

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2018-08-24 22:00 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-06-27 10:34 [RFC PATCH v1] media: uvcvideo: Cache URB header data before processing Keiichi Watanabe
2018-06-27 17:21 ` Kieran Bingham
2018-07-24  4:18   ` Keiichi Watanabe
2018-07-24 11:07     ` Kieran Bingham
2018-07-24 11:03 ` Kieran Bingham
2018-07-30 16:00 ` Laurent Pinchart
2018-07-30 16:03   ` Kieran Bingham
2018-08-08  4:08   ` Tomasz Figa
2018-08-08  8:42     ` Laurent Pinchart
2018-08-08 12:45       ` Keiichi Watanabe
2018-08-08 13:07         ` Kieran Bingham
2018-08-08 14:02           ` Keiichi Watanabe
2018-08-08 14:20         ` Alan Stern
2018-08-08 16:22           ` Laurent Pinchart
2018-08-08 16:27             ` Tomasz Figa
2018-08-08 22:33               ` Laurent Pinchart
2018-08-08 16:29             ` Ezequiel Garcia
2018-08-08 22:35               ` Laurent Pinchart
2018-08-08 17:04             ` Alan Stern
2018-08-08 22:20               ` Laurent Pinchart
2018-08-09 14:12                 ` Alan Stern
2018-08-24  6:06                   ` Keiichi Watanabe
2018-08-24 22:00                     ` Kieran Bingham

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).