* [RFC] drm: Optimise drm_ioctl() for small user args
@ 2017-05-30 12:55 Chris Wilson
2017-05-30 13:24 ` Joonas Lahtinen
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Chris Wilson @ 2017-05-30 12:55 UTC (permalink / raw)
To: dri-devel; +Cc: intel-gfx
When looking at simple ioctls coupled with conveniently small user
parameters, the overhead of the syscall and drm_ioctl() presents large
low-hanging fruit. Profiling trivial microbenchmarks around
i915_gem_busy_ioctl, the low-hanging fruit consists of the call to
copy_user(). Those calls are only inlined by the macro where the
constant is known at compile-time, but the ioctl argument size depends
on the ioctl number. To help the compiler, explicitly add switches for
the small sizes that expand to simple moves to/from user. Doing the
multiple inlines does add significant code bloat, so its value is very
debatable. Back to the trivial, but frequently used, example of
i915_gem_busy_ioctl(): on a Broadwell, avoiding the call gives
us a 15-25% improvement:
                 before       after
single           100.173ns    84.496ns
parallel (x4)    204.275ns    152.957ns
On a baby Broxton nuc:
                 before       after
single           245.355ns    199.477ns
parallel (x2)    280.892ns    232.726ns
Looking at the cost distribution by moving an equivalent switch into
arch/x86/lib/usercopy, the overhead to the copy user is split almost
equally between the function call and the actual copy itself. It seems
copy_user_enhanced_fast_string simply is not that good at small (single
register) copies. Something as simple as
@@ -28,6 +28,9 @@ copy_user_generic(void *to, const void *from, unsigned len)
{
unsigned ret;
+ if (len <= 16)
+ return copy_user_generic_unrolled(to, from, len);
is enough to speed up i915_gem_busy_ioctl() by 10% :|
Note that this overhead may entirely be x86 specific.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
drivers/gpu/drm/drm_ioctl.c | 111 ++++++++++++++++++++++++++++++++------------
1 file changed, 82 insertions(+), 29 deletions(-)
diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 865e3ee4d743..93ba59a30a85 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -715,11 +715,11 @@ long drm_ioctl(struct file *filp,
const struct drm_ioctl_desc *ioctl = NULL;
drm_ioctl_t *func;
unsigned int nr = DRM_IOCTL_NR(cmd);
- int retcode = -EINVAL;
char stack_kdata[128];
- char *kdata = NULL;
+ char *kdata = stack_kdata;
unsigned int in_size, out_size, drv_size, ksize;
bool is_driver_ioctl;
+ int retcode;
dev = file_priv->minor->dev;
@@ -731,12 +731,12 @@ long drm_ioctl(struct file *filp,
if (is_driver_ioctl) {
/* driver ioctl */
if (nr - DRM_COMMAND_BASE >= dev->driver->num_ioctls)
- goto err_i1;
+ goto err_invalid_ioctl;
ioctl = &dev->driver->ioctls[nr - DRM_COMMAND_BASE];
} else {
/* core ioctl */
if (nr >= DRM_CORE_IOCTL_COUNT)
- goto err_i1;
+ goto err_invalid_ioctl;
ioctl = &drm_ioctls[nr];
}
@@ -758,29 +758,50 @@ long drm_ioctl(struct file *filp,
if (unlikely(!func)) {
DRM_DEBUG("no function\n");
- retcode = -EINVAL;
- goto err_i1;
+ goto err_invalid;
}
retcode = drm_ioctl_permit(ioctl->flags, file_priv);
if (unlikely(retcode))
- goto err_i1;
-
- if (ksize <= sizeof(stack_kdata)) {
- kdata = stack_kdata;
- } else {
- kdata = kmalloc(ksize, GFP_KERNEL);
- if (!kdata) {
- retcode = -ENOMEM;
- goto err_i1;
+ goto out;
+
+ if (in_size) {
+ if (unlikely(!access_ok(VERIFY_READ, arg, in_size)))
+ goto err_invalid_user;
+
+ switch (in_size) {
+ case 4:
+ if (unlikely(__copy_from_user(kdata, (void __user *)arg,
+ 4)))
+ goto err_invalid_user;
+ break;
+ case 8:
+ if (unlikely(__copy_from_user(kdata, (void __user *)arg,
+ 8)))
+ goto err_invalid_user;
+ break;
+ case 16:
+ if (unlikely(__copy_from_user(kdata, (void __user *)arg,
+ 16)))
+ goto err_invalid_user;
+ break;
+
+ default:
+ if (ksize > sizeof(stack_kdata)) {
+ kdata = kmalloc(ksize, GFP_KERNEL);
+ if (unlikely(!kdata)) {
+ retcode = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (unlikely(__copy_from_user(kdata, (void __user *)arg,
+ in_size)))
+ goto err_invalid_user;
+ break;
}
}
- if (copy_from_user(kdata, (void __user *)arg, in_size) != 0) {
- retcode = -EFAULT;
- goto err_i1;
- }
-
if (ksize > in_size)
memset(kdata + in_size, 0, ksize - in_size);
@@ -794,21 +815,53 @@ long drm_ioctl(struct file *filp,
mutex_unlock(&drm_global_mutex);
}
- if (copy_to_user((void __user *)arg, kdata, out_size) != 0)
- retcode = -EFAULT;
-
- err_i1:
- if (!ioctl)
- DRM_DEBUG("invalid ioctl: pid=%d, dev=0x%lx, auth=%d, cmd=0x%02x, nr=0x%02x\n",
- task_pid_nr(current),
- (long)old_encode_dev(file_priv->minor->kdev->devt),
- file_priv->authenticated, cmd, nr);
+ if (out_size) {
+ if (unlikely(!access_ok(VERIFY_WRITE, arg, out_size)))
+ goto err_invalid_user;
+
+ switch (out_size) {
+ case 4:
+ if (unlikely(__copy_to_user((void __user *)arg,
+ kdata, 4)))
+ goto err_invalid_user;
+ break;
+ case 8:
+ if (unlikely(__copy_to_user((void __user *)arg,
+ kdata, 8)))
+ goto err_invalid_user;
+ break;
+ case 16:
+ if (unlikely(__copy_to_user((void __user *)arg,
+ kdata, 16)))
+ goto err_invalid_user;
+ break;
+ default:
+ if (unlikely(__copy_to_user((void __user *)arg,
+ kdata, out_size)))
+ goto err_invalid_user;
+ break;
+ }
+ }
+out:
if (kdata != stack_kdata)
kfree(kdata);
if (retcode)
DRM_DEBUG("ret = %d\n", retcode);
return retcode;
+
+err_invalid_ioctl:
+ DRM_DEBUG("invalid ioctl: pid=%d, dev=0x%lx, auth=%d, cmd=0x%02x, nr=0x%02x\n",
+ task_pid_nr(current),
+ (long)old_encode_dev(file_priv->minor->kdev->devt),
+ file_priv->authenticated, cmd, nr);
+err_invalid:
+ retcode = -EINVAL;
+ goto out;
+
+err_invalid_user:
+ retcode = -EFAULT;
+ goto out;
}
EXPORT_SYMBOL(drm_ioctl);
--
2.11.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [RFC] drm: Optimise drm_ioctl() for small user args
2017-05-30 12:55 [RFC] drm: Optimise drm_ioctl() for small user args Chris Wilson
@ 2017-05-30 13:24 ` Joonas Lahtinen
2017-05-30 14:27 ` ✓ Fi.CI.BAT: success for " Patchwork
2017-05-30 15:24 ` [RFC] " Chris Wilson
2 siblings, 0 replies; 4+ messages in thread
From: Joonas Lahtinen @ 2017-05-30 13:24 UTC (permalink / raw)
To: Chris Wilson, dri-devel; +Cc: intel-gfx
On ti, 2017-05-30 at 13:55 +0100, Chris Wilson wrote:
> When looking at simple ioctls coupled with conveniently small user
> parameters, the overhead of the syscall and drm_ioctl() present large
> low hanging fruit. Profiling trivial microbenchmarks around
> i915_gem_busy_ioctl, the low hanging fruit comprises of the call to
> copy_user(). Those calls are only inlined by the macro where the
> constant is known at compile-time, but the ioctl argument size depends
> on the ioctl number. To help the compiler, explicitly add switches for
> the small sizes that expand to simple moves to/from user. Doing the
> multiple inlines does add significant code bloat, so it is very
> debatable as to its value. Back to the trivial, but frequently used,
> example of i915_gem_busy_ioctl() on a Broadwell avoiding the call gives
> us a 15-25% improvement:
>
> before after
> single 100.173ns 84.496ns
> parallel (x4) 204.275ns 152.957ns
>
> On a baby Broxton nuc:
>
> before after
> single 245.355ns 199.477ns
> parallel (x2) 280.892ns 232.726ns
>
> Looking at the cost distribution by moving an equivalent switch into
> arch/x86/lib/usercopy, the overhead to the copy user is split almost
> equally between the function call and the actual copy itself. It seems
> copy_user_enhanced_fast_string simply is not that good at small (single
> register) copies. Something as simple as
>
> @@ -28,6 +28,9 @@ copy_user_generic(void *to, const void *from, unsigned len)
> {
> unsigned ret;
>
> + if (len <= 16)
> + return copy_user_generic_unrolled(to, from, len);
>
> is enough to speed up i915_gem_busy_ioctl() by 10% :|
>
> Note that this overhead may entirely be x86 specific.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
I think this should be integrated into __copy_{to,from}_user directly,
but in the meantime the code is:
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Regards, Joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
* ✓ Fi.CI.BAT: success for drm: Optimise drm_ioctl() for small user args
2017-05-30 12:55 [RFC] drm: Optimise drm_ioctl() for small user args Chris Wilson
2017-05-30 13:24 ` Joonas Lahtinen
@ 2017-05-30 14:27 ` Patchwork
2017-05-30 15:24 ` [RFC] " Chris Wilson
2 siblings, 0 replies; 4+ messages in thread
From: Patchwork @ 2017-05-30 14:27 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: drm: Optimise drm_ioctl() for small user args
URL : https://patchwork.freedesktop.org/series/25044/
State : success
== Summary ==
Series 25044v1 drm: Optimise drm_ioctl() for small user args
https://patchwork.freedesktop.org/api/1.0/series/25044/revisions/1/mbox/
Test kms_busy:
Subgroup basic-flip-default-a:
dmesg-warn -> PASS (fi-skl-6700hq) fdo#101144 +3
Test kms_cursor_legacy:
Subgroup basic-busy-flip-before-cursor-atomic:
fail -> PASS (fi-skl-6700hq) fdo#101154 +7
fdo#101144 https://bugs.freedesktop.org/show_bug.cgi?id=101144
fdo#101154 https://bugs.freedesktop.org/show_bug.cgi?id=101154
fi-bdw-5557u total:278 pass:267 dwarn:0 dfail:0 fail:0 skip:11 time:445s
fi-bdw-gvtdvm total:278 pass:256 dwarn:8 dfail:0 fail:0 skip:14 time:432s
fi-bsw-n3050 total:278 pass:242 dwarn:0 dfail:0 fail:0 skip:36 time:573s
fi-bxt-j4205 total:278 pass:259 dwarn:0 dfail:0 fail:0 skip:19 time:510s
fi-byt-j1900 total:278 pass:254 dwarn:0 dfail:0 fail:0 skip:24 time:484s
fi-byt-n2820 total:278 pass:250 dwarn:0 dfail:0 fail:0 skip:28 time:479s
fi-hsw-4770 total:278 pass:262 dwarn:0 dfail:0 fail:0 skip:16 time:429s
fi-hsw-4770r total:278 pass:262 dwarn:0 dfail:0 fail:0 skip:16 time:416s
fi-ilk-650 total:278 pass:228 dwarn:0 dfail:0 fail:0 skip:50 time:420s
fi-ivb-3520m total:278 pass:260 dwarn:0 dfail:0 fail:0 skip:18 time:489s
fi-ivb-3770 total:278 pass:260 dwarn:0 dfail:0 fail:0 skip:18 time:472s
fi-kbl-7500u total:278 pass:255 dwarn:5 dfail:0 fail:0 skip:18 time:475s
fi-kbl-7560u total:278 pass:263 dwarn:5 dfail:0 fail:0 skip:10 time:573s
fi-skl-6260u total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:463s
fi-skl-6700hq total:278 pass:239 dwarn:0 dfail:1 fail:17 skip:21 time:431s
fi-skl-6700k total:278 pass:256 dwarn:4 dfail:0 fail:0 skip:18 time:467s
fi-skl-6770hq total:278 pass:268 dwarn:0 dfail:0 fail:0 skip:10 time:502s
fi-skl-gvtdvm total:278 pass:265 dwarn:0 dfail:0 fail:0 skip:13 time:437s
fi-snb-2520m total:278 pass:250 dwarn:0 dfail:0 fail:0 skip:28 time:718s
fi-snb-2600 total:278 pass:249 dwarn:0 dfail:0 fail:0 skip:29 time:402s
f98c6d553b47ea4ad8eed33a9e768d8e30d8674a drm-tip: 2017y-05m-30d-13h-06m-42s UTC integration manifest
cb00df0 drm: Optimise drm_ioctl() for small user args
== Logs ==
For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4831/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC] drm: Optimise drm_ioctl() for small user args
2017-05-30 12:55 [RFC] drm: Optimise drm_ioctl() for small user args Chris Wilson
2017-05-30 13:24 ` Joonas Lahtinen
2017-05-30 14:27 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2017-05-30 15:24 ` Chris Wilson
2 siblings, 0 replies; 4+ messages in thread
From: Chris Wilson @ 2017-05-30 15:24 UTC (permalink / raw)
To: dri-devel; +Cc: intel-gfx, Joonas Lahtinen
On Tue, May 30, 2017 at 01:55:20PM +0100, Chris Wilson wrote:
> When looking at simple ioctls coupled with conveniently small user
> parameters, the overhead of the syscall and drm_ioctl() present large
> low hanging fruit. Profiling trivial microbenchmarks around
> i915_gem_busy_ioctl, the low hanging fruit comprises of the call to
> copy_user(). Those calls are only inlined by the macro where the
> constant is known at compile-time, but the ioctl argument size depends
> on the ioctl number. To help the compiler, explicitly add switches for
> the small sizes that expand to simple moves to/from user. Doing the
> multiple inlines does add significant code bloat, so it is very
> debatable as to its value. Back to the trivial, but frequently used,
> example of i915_gem_busy_ioctl() on a Broadwell avoiding the call gives
> us a 15-25% improvement:
>
> before after
> single 100.173ns 84.496ns
> parallel (x4) 204.275ns 152.957ns
>
> On a baby Broxton nuc:
>
> before after
> single 245.355ns 199.477ns
> parallel (x2) 280.892ns 232.726ns
>
> Looking at the cost distribution by moving an equivalent switch into
> arch/x86/lib/usercopy, the overhead to the copy user is split almost
> equally between the function call and the actual copy itself. It seems
> copy_user_enhanced_fast_string simply is not that good at small (single
> register) copies. Something as simple as
>
> @@ -28,6 +28,9 @@ copy_user_generic(void *to, const void *from, unsigned len)
> {
> unsigned ret;
>
> + if (len <= 16)
> + return copy_user_generic_unrolled(to, from, len);
>
> is enough to speed up i915_gem_busy_ioctl() by 10% :|
>
> Note that this overhead may entirely be x86 specific.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
> + if (in_size) {
> + if (unlikely(!access_ok(VERIFY_READ, arg, in_size)))
> + goto err_invalid_user;
> +
> + switch (in_size) {
> + case 4:
> + if (unlikely(__copy_from_user(kdata, (void __user *)arg,
> + 4)))
> + goto err_invalid_user;
> + break;
> + case 8:
> + if (unlikely(__copy_from_user(kdata, (void __user *)arg,
> + 8)))
> + goto err_invalid_user;
> + break;
> + case 16:
> + if (unlikely(__copy_from_user(kdata, (void __user *)arg,
> + 16)))
> + goto err_invalid_user;
> + break;
For example, currently x86-32 only converts case 4 above. It could
trivially do case 8 as well, but by case 16 it may as well use the
function call to its loop in assembly. And currently x86-32 has no
optimisations for fixed-size puts.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2017-05-30 15:24 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-30 12:55 [RFC] drm: Optimise drm_ioctl() for small user args Chris Wilson
2017-05-30 13:24 ` Joonas Lahtinen
2017-05-30 14:27 ` ✓ Fi.CI.BAT: success for " Patchwork
2017-05-30 15:24 ` [RFC] " Chris Wilson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.