[v2,3/3] rseq: optimise rseq_get_rseq_cs() and clear_rseq_cs()

Message ID 20210413162240.3131033-4-eric.dumazet@gmail.com
State New
Series
  • rseq: minor optimizations

Commit Message

Eric Dumazet April 13, 2021, 4:22 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
update includes") introduced performance regressions on our servers.

Using copy_from_user() and clear_user() for 64bit values
is suboptimal.

We can use the faster put_user() and get_user() instead.

32bit arches can be changed to use the ptr32 field,
since the padding field must always be zero.

v2: added ideas from Peter and Mathieu about making this
    generic, since my initial patch only dealt with
    64bit arches.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Arjun Roy <arjunroy@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/rseq.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

Comments

Mathieu Desnoyers April 13, 2021, 4:54 p.m. UTC | #1
----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:

> From: Eric Dumazet <edumazet@google.com>
> 
> Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
> update includes") added regressions for our servers.
> 
> Using copy_from_user() and clear_user() for 64bit values
> is suboptimal.
> 
> We can use faster put_user() and get_user().
> 
> 32bit arches can be changed to use the ptr32 field,
> since the padding field must always be zero.
> 
> v2: added ideas from Peter and Mathieu about making this
>    generic, since my initial patch was only dealing with
>    64bit arches.

Ah, now I remember the reason why reading and clearing the entire 64-bit
field is important: we don't want to allow user-space processes to use this
change in behavior to figure out whether they are running on a 32-bit kernel
or in 32-bit compat mode on a 64-bit kernel.

So although I'm fine with making 64-bit kernels faster, we'll want to keep
updating the entire 64-bit ptr field on 32-bit kernels as well.

Thanks,

Mathieu

> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: "Paul E. McKenney" <paulmck@kernel.org>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Arjun Roy <arjunroy@google.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> ---
> kernel/rseq.c | 41 +++++++++++++++++++++++++++++++++--------
> 1 file changed, 33 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index
> cfe01ab5253c1c424c0e8b25acbb6a8e1b41a5b6..f2eee3f7f5d330688c81cb2e57d47ca6b843873e
> 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -119,23 +119,46 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
> 	return 0;
> }
> 
> +#ifdef CONFIG_64BIT
> +static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
> +			   const struct rseq __user *rseq)
> +{
> +	u64 ptr;
> +
> +	if (get_user(ptr, &rseq->rseq_cs.ptr64))
> +		return -EFAULT;
> +	*uptrp = (struct rseq_cs __user *)ptr;
> +	return 0;
> +}
> +#else
> +static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
> +			   const struct rseq __user *rseq)
> +{
> +	u32 ptr;
> +
> +	if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
> +		return -EFAULT;
> +	*uptrp = (struct rseq_cs __user *)ptr;
> +	return 0;
> +}
> +#endif
> +
> static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
> {
> 	struct rseq_cs __user *urseq_cs;
> -	u64 ptr;
> 	u32 __user *usig;
> 	u32 sig;
> 	int ret;
> 
> -	if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
> +	if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
> 		return -EFAULT;
> -	if (!ptr) {
> +	if (!urseq_cs) {
> 		memset(rseq_cs, 0, sizeof(*rseq_cs));
> 		return 0;
> 	}
> -	if (ptr >= TASK_SIZE)
> +	if ((unsigned long)urseq_cs >= TASK_SIZE)
> 		return -EINVAL;
> -	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
> +
> 	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
> 		return -EFAULT;
> 
> @@ -211,9 +234,11 @@ static int clear_rseq_cs(struct task_struct *t)
> 	 *
> 	 * Set rseq_cs to NULL.
> 	 */
> -	if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
> -		return -EFAULT;
> -	return 0;
> +#ifdef CONFIG_64BIT
> +	return put_user(0UL, &t->rseq->rseq_cs.ptr64);
> +#else
> +	return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
> +#endif
> }
> 
> /*
> --
> 2.31.1.295.g9ea45b61b8-goog
Eric Dumazet April 13, 2021, 4:57 p.m. UTC | #2
On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
>
> > From: Eric Dumazet <edumazet@google.com>
> >
> > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
> > update includes") added regressions for our servers.
> >
> > Using copy_from_user() and clear_user() for 64bit values
> > is suboptimal.
> >
> > We can use faster put_user() and get_user().
> >
> > 32bit arches can be changed to use the ptr32 field,
> > since the padding field must always be zero.
> >
> > v2: added ideas from Peter and Mathieu about making this
> >    generic, since my initial patch was only dealing with
> >    64bit arches.
>
> Ah, now I remember the reason why reading and clearing the entire 64-bit
> is important: it's because we don't want to allow user-space processes to
> use this change in behavior to figure out whether they are running on a
> 32-bit or in a 32-bit compat mode on a 64-bit kernel.
>
> So although I'm fine with making 64-bit kernels faster, we'll want to keep
> updating the entire 64-bit ptr field on 32-bit kernels as well.
>
> Thanks,
>

So... back to V1 then?
Eric Dumazet April 13, 2021, 5:01 p.m. UTC | #3
On Tue, Apr 13, 2021 at 6:57 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
> >
> > ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
> >
> > > From: Eric Dumazet <edumazet@google.com>
> > >
> > > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
> > > update includes") added regressions for our servers.
> > >
> > > Using copy_from_user() and clear_user() for 64bit values
> > > is suboptimal.
> > >
> > > We can use faster put_user() and get_user().
> > >
> > > 32bit arches can be changed to use the ptr32 field,
> > > since the padding field must always be zero.
> > >
> > > v2: added ideas from Peter and Mathieu about making this
> > >    generic, since my initial patch was only dealing with
> > >    64bit arches.
> >
> > Ah, now I remember the reason why reading and clearing the entire 64-bit
> > is important: it's because we don't want to allow user-space processes to
> > use this change in behavior to figure out whether they are running on a
> > 32-bit or in a 32-bit compat mode on a 64-bit kernel.
> >
> > So although I'm fine with making 64-bit kernels faster, we'll want to keep
> > updating the entire 64-bit ptr field on 32-bit kernels as well.
> >
> > Thanks,
> >
>
> So... back to V1 then ?

Or add more stuff, as in:

#ifdef CONFIG_64BIT
	return put_user(0UL, &t->rseq->rseq_cs.ptr64);
#else
	return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
	       put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
#endif
Mathieu Desnoyers April 13, 2021, 5:06 p.m. UTC | #4
----- On Apr 13, 2021, at 12:57 PM, Eric Dumazet edumazet@google.com wrote:

> On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
>>
>> ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
>>
>> > From: Eric Dumazet <edumazet@google.com>
>> >
>> > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
>> > update includes") added regressions for our servers.
>> >
>> > Using copy_from_user() and clear_user() for 64bit values
>> > is suboptimal.
>> >
>> > We can use faster put_user() and get_user().
>> >
>> > 32bit arches can be changed to use the ptr32 field,
>> > since the padding field must always be zero.
>> >
>> > v2: added ideas from Peter and Mathieu about making this
>> >    generic, since my initial patch was only dealing with
>> >    64bit arches.
>>
>> Ah, now I remember the reason why reading and clearing the entire 64-bit
>> is important: it's because we don't want to allow user-space processes to
>> use this change in behavior to figure out whether they are running on a
>> 32-bit or in a 32-bit compat mode on a 64-bit kernel.
>>
>> So although I'm fine with making 64-bit kernels faster, we'll want to keep
>> updating the entire 64-bit ptr field on 32-bit kernels as well.
>>
>> Thanks,
>>
> 
> So... back to V1 then ?

In terms of behavior, yes. And it's probably the "easy" fix, but I hate that
it adds lots of preprocessor ifdefs into the rseq code.

But this would require auditing get_user()/put_user() for each architecture
supported by rseq to ensure they support 8-byte load/store. And it would become
an added burden on architecture maintainers wishing to add rseq support for their
architecture.

One alternative would be to implement rseq_get_user_u64 and rseq_put_user_u64
wrappers as static functions within rseq.c to hide the preprocessor ifdeffery
from the higher-level code. I try very hard to avoid mixing preprocessor ifdefs
with C code logic whenever I can.
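
A minimal sketch of what such wrappers could look like (illustration only, not
code from this thread; it just hides the CONFIG_64BIT ifdeffery behind static
helpers, using copy_from_user()/copy_to_user() on 32-bit so the whole 64-bit
field keeps being read and written):

/*
 * Sketch only: u64 user accessors local to kernel/rseq.c. On 64-bit
 * kernels they use the fast get_user()/put_user(); on 32-bit kernels
 * they fall back to copy_from_user()/copy_to_user() so the entire
 * 64-bit field (including the padding word) is still read/cleared.
 */
#ifdef CONFIG_64BIT
static int rseq_get_user_u64(u64 *val, const u64 __user *uptr)
{
	return get_user(*val, uptr);
}

static int rseq_put_user_u64(u64 __user *uptr, u64 val)
{
	return put_user(val, uptr);
}
#else
static int rseq_get_user_u64(u64 *val, const u64 __user *uptr)
{
	return copy_from_user(val, uptr, sizeof(*val)) ? -EFAULT : 0;
}

static int rseq_put_user_u64(u64 __user *uptr, u64 val)
{
	return copy_to_user(uptr, &val, sizeof(val)) ? -EFAULT : 0;
}
#endif

rseq_get_rseq_cs() and clear_rseq_cs() would then operate on
&t->rseq->rseq_cs.ptr64 through these helpers, with no #ifdef at the
call sites.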

Thanks,

Mathieu
Eric Dumazet April 13, 2021, 5:07 p.m. UTC | #5
On Tue, Apr 13, 2021 at 7:01 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 13, 2021 at 6:57 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
> > <mathieu.desnoyers@efficios.com> wrote:
> > >
> > > ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
> > >
> > > > From: Eric Dumazet <edumazet@google.com>
> > > >
> > > > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
> > > > update includes") added regressions for our servers.
> > > >
> > > > Using copy_from_user() and clear_user() for 64bit values
> > > > is suboptimal.
> > > >
> > > > We can use faster put_user() and get_user().
> > > >
> > > > 32bit arches can be changed to use the ptr32 field,
> > > > since the padding field must always be zero.
> > > >
> > > > v2: added ideas from Peter and Mathieu about making this
> > > >    generic, since my initial patch was only dealing with
> > > >    64bit arches.
> > >
> > > Ah, now I remember the reason why reading and clearing the entire 64-bit
> > > is important: it's because we don't want to allow user-space processes to
> > > use this change in behavior to figure out whether they are running on a
> > > 32-bit or in a 32-bit compat mode on a 64-bit kernel.
> > >
> > > So although I'm fine with making 64-bit kernels faster, we'll want to keep
> > > updating the entire 64-bit ptr field on 32-bit kernels as well.
> > >
> > > Thanks,
> > >
> >
> > So... back to V1 then ?
>
> Or add more stuff as in :

Diff against v2, WDYT?

diff --git a/kernel/rseq.c b/kernel/rseq.c
index f2eee3f7f5d330688c81cb2e57d47ca6b843873e..537b1f684efa11069990018ffa3642c209993011 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -136,6 +136,10 @@ static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
 {
        u32 ptr;

+       if (get_user(ptr, &rseq->rseq_cs.ptr.padding))
+               return -EFAULT;
+       if (ptr)
+               return -EINVAL;
        if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
                return -EFAULT;
        *uptrp = (struct rseq_cs __user *)ptr;
@@ -150,8 +154,9 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
        u32 sig;
        int ret;

-       if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
-               return -EFAULT;
+       ret = rseq_get_cs_ptr(&urseq_cs, t->rseq);
+       if (ret)
+               return ret;
        if (!urseq_cs) {
                memset(rseq_cs, 0, sizeof(*rseq_cs));
                return 0;
@@ -237,7 +242,8 @@ static int clear_rseq_cs(struct task_struct *t)
 #ifdef CONFIG_64BIT
        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
 #else
-       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
+       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
+              put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
 #endif
 }
Mathieu Desnoyers April 13, 2021, 5:20 p.m. UTC | #6
----- On Apr 13, 2021, at 1:07 PM, Eric Dumazet edumazet@google.com wrote:

> On Tue, Apr 13, 2021 at 7:01 PM Eric Dumazet <edumazet@google.com> wrote:
>>
>> On Tue, Apr 13, 2021 at 6:57 PM Eric Dumazet <edumazet@google.com> wrote:
>> >
>> > On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
>> > <mathieu.desnoyers@efficios.com> wrote:
>> > >
>> > > ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
>> > >
>> > > > From: Eric Dumazet <edumazet@google.com>
>> > > >
>> > > > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
>> > > > update includes") added regressions for our servers.
>> > > >
>> > > > Using copy_from_user() and clear_user() for 64bit values
>> > > > is suboptimal.
>> > > >
>> > > > We can use faster put_user() and get_user().
>> > > >
>> > > > 32bit arches can be changed to use the ptr32 field,
>> > > > since the padding field must always be zero.
>> > > >
>> > > > v2: added ideas from Peter and Mathieu about making this
>> > > >    generic, since my initial patch was only dealing with
>> > > >    64bit arches.
>> > >
>> > > Ah, now I remember the reason why reading and clearing the entire 64-bit
>> > > is important: it's because we don't want to allow user-space processes to
>> > > use this change in behavior to figure out whether they are running on a
>> > > 32-bit or in a 32-bit compat mode on a 64-bit kernel.
>> > >
>> > > So although I'm fine with making 64-bit kernels faster, we'll want to keep
>> > > updating the entire 64-bit ptr field on 32-bit kernels as well.
>> > >
>> > > Thanks,
>> > >
>> >
>> > So... back to V1 then ?
>>
>> Or add more stuff as in :
> 
> diff against v2, WDYT ?

I like this approach slightly better, because it moves the preprocessor ifdefs into
rseq_get_rseq_cs and clear_rseq_cs, while keeping the same behavior for a 32-bit
process running on a native 32-bit kernel and as a compat task on a 64-bit kernel.

That being said, I don't expect anyone to care much about the performance of 32-bit
kernels, so we could use copy_from_user() on 32-bit kernels to remove special cases
from the 32-bit-specific code. This would eliminate the 32-bit-specific "padding" read
and let the TASK_SIZE comparison handle the check for both 32-bit and 64-bit kernels.

As for clear_user(), I wonder whether we could simply keep using it, but change the
clear_user() macro to figure out that it can use a faster 8-byte put_user()? I find it
odd that performance optimizations which would be relevant elsewhere creep into the
rseq code.

Thanks,

Mathieu

> 
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index
> f2eee3f7f5d330688c81cb2e57d47ca6b843873e..537b1f684efa11069990018ffa3642c209993011
> 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -136,6 +136,10 @@ static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
> {
>        u32 ptr;
> 
> +       if (get_user(ptr, &rseq->rseq_cs.ptr.padding))
> +               return -EFAULT;
> +       if (ptr)
> +               return -EINVAL;
>        if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
>                return -EFAULT;
>        *uptrp = (struct rseq_cs __user *)ptr;
> @@ -150,8 +154,9 @@ static int rseq_get_rseq_cs(struct task_struct *t,
> struct rseq_cs *rseq_cs)
>        u32 sig;
>        int ret;
> 
> -       if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
> -               return -EFAULT;
> +       ret = rseq_get_cs_ptr(&urseq_cs, t->rseq);
> +       if (ret)
> +               return ret;
>        if (!urseq_cs) {
>                memset(rseq_cs, 0, sizeof(*rseq_cs));
>                return 0;
> @@ -237,7 +242,8 @@ static int clear_rseq_cs(struct task_struct *t)
> #ifdef CONFIG_64BIT
>        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
> #else
> -       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
> +       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
> +              put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
> #endif
>  }
Eric Dumazet April 13, 2021, 5:33 p.m. UTC | #7
On Tue, Apr 13, 2021 at 7:20 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> ----- On Apr 13, 2021, at 1:07 PM, Eric Dumazet edumazet@google.com wrote:
>
> > On Tue, Apr 13, 2021 at 7:01 PM Eric Dumazet <edumazet@google.com> wrote:
> >>
> >> On Tue, Apr 13, 2021 at 6:57 PM Eric Dumazet <edumazet@google.com> wrote:
> >> >
> >> > On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
> >> > <mathieu.desnoyers@efficios.com> wrote:
> >> > >
> >> > > ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
> >> > >
> >> > > > From: Eric Dumazet <edumazet@google.com>
> >> > > >
> >> > > > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
> >> > > > update includes") added regressions for our servers.
> >> > > >
> >> > > > Using copy_from_user() and clear_user() for 64bit values
> >> > > > is suboptimal.
> >> > > >
> >> > > > We can use faster put_user() and get_user().
> >> > > >
> >> > > > 32bit arches can be changed to use the ptr32 field,
> >> > > > since the padding field must always be zero.
> >> > > >
> >> > > > v2: added ideas from Peter and Mathieu about making this
> >> > > >    generic, since my initial patch was only dealing with
> >> > > >    64bit arches.
> >> > >
> >> > > Ah, now I remember the reason why reading and clearing the entire 64-bit
> >> > > is important: it's because we don't want to allow user-space processes to
> >> > > use this change in behavior to figure out whether they are running on a
> >> > > 32-bit or in a 32-bit compat mode on a 64-bit kernel.
> >> > >
> >> > > So although I'm fine with making 64-bit kernels faster, we'll want to keep
> >> > > updating the entire 64-bit ptr field on 32-bit kernels as well.
> >> > >
> >> > > Thanks,
> >> > >
> >> >
> >> > So... back to V1 then ?
> >>
> >> Or add more stuff as in :
> >
> > diff against v2, WDYT ?
>
> I like this approach slightly better, because it moves the preprocessor ifdefs into
> rseq_get_rseq_cs and clear_rseq_cs, while keeping the same behavior for a 32-bit
> process running on native 32-bit kernel and as compat task on a 64-bit kernel.
>
> That being said, I don't expect anyone to care much about performance of 32-bit
> kernels, so we could use copy_from_user() on 32-bit kernels to remove special-cases
> in 32-bit specific code. This would eliminate the 32-bit specific "padding" read, and
> let the TASK_SIZE comparison handle the check for both 32-bit and 64-bit kernels.
>
> As for clear_user(), I wonder whether we could simply keep using it, but change the
> clear_user() macro to figure out that it can use a faster 8-byte put_user ? I find it
> odd that performance optimizations which would be relevant elsewhere creep into the
> rseq code.


clear_user() is a maze of arch-dependent macros/functions/assembly.

I guess the same could be said of copy_in_user(), but apparently we removed
such special-casing, as in commit a41e0d754240fe8ca9c4f2070bf67e3b0228aa22.

It definitely seems odd to have to carefully choose between multiple methods.


>
> Thanks,
>
> Mathieu
>
> >
> > diff --git a/kernel/rseq.c b/kernel/rseq.c
> > index
> > f2eee3f7f5d330688c81cb2e57d47ca6b843873e..537b1f684efa11069990018ffa3642c209993011
> > 100644
> > --- a/kernel/rseq.c
> > +++ b/kernel/rseq.c
> > @@ -136,6 +136,10 @@ static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
> > {
> >        u32 ptr;
> >
> > +       if (get_user(ptr, &rseq->rseq_cs.ptr.padding))
> > +               return -EFAULT;
> > +       if (ptr)
> > +               return -EINVAL;
> >        if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
> >                return -EFAULT;
> >        *uptrp = (struct rseq_cs __user *)ptr;
> > @@ -150,8 +154,9 @@ static int rseq_get_rseq_cs(struct task_struct *t,
> > struct rseq_cs *rseq_cs)
> >        u32 sig;
> >        int ret;
> >
> > -       if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
> > -               return -EFAULT;
> > +       ret = rseq_get_cs_ptr(&urseq_cs, t->rseq);
> > +       if (ret)
> > +               return ret;
> >        if (!urseq_cs) {
> >                memset(rseq_cs, 0, sizeof(*rseq_cs));
> >                return 0;
> > @@ -237,7 +242,8 @@ static int clear_rseq_cs(struct task_struct *t)
> > #ifdef CONFIG_64BIT
> >        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
> > #else
> > -       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
> > +       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
> > +              put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
> > #endif
> >  }
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
Mathieu Desnoyers April 13, 2021, 6 p.m. UTC | #8
----- On Apr 13, 2021, at 1:33 PM, Eric Dumazet edumazet@google.com wrote:

> On Tue, Apr 13, 2021 at 7:20 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
>>
>> ----- On Apr 13, 2021, at 1:07 PM, Eric Dumazet edumazet@google.com wrote:
>>
>> > On Tue, Apr 13, 2021 at 7:01 PM Eric Dumazet <edumazet@google.com> wrote:
>> >>
>> >> On Tue, Apr 13, 2021 at 6:57 PM Eric Dumazet <edumazet@google.com> wrote:
>> >> >
>> >> > On Tue, Apr 13, 2021 at 6:54 PM Mathieu Desnoyers
>> >> > <mathieu.desnoyers@efficios.com> wrote:
>> >> > >
>> >> > > ----- On Apr 13, 2021, at 12:22 PM, Eric Dumazet eric.dumazet@gmail.com wrote:
>> >> > >
>> >> > > > From: Eric Dumazet <edumazet@google.com>
>> >> > > >
>> >> > > > Commit ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union,
>> >> > > > update includes") added regressions for our servers.
>> >> > > >
>> >> > > > Using copy_from_user() and clear_user() for 64bit values
>> >> > > > is suboptimal.
>> >> > > >
>> >> > > > We can use faster put_user() and get_user().
>> >> > > >
>> >> > > > 32bit arches can be changed to use the ptr32 field,
>> >> > > > since the padding field must always be zero.
>> >> > > >
>> >> > > > v2: added ideas from Peter and Mathieu about making this
>> >> > > >    generic, since my initial patch was only dealing with
>> >> > > >    64bit arches.
>> >> > >
>> >> > > Ah, now I remember the reason why reading and clearing the entire 64-bit
>> >> > > is important: it's because we don't want to allow user-space processes to
>> >> > > use this change in behavior to figure out whether they are running on a
>> >> > > 32-bit or in a 32-bit compat mode on a 64-bit kernel.
>> >> > >
>> >> > > So although I'm fine with making 64-bit kernels faster, we'll want to keep
>> >> > > updating the entire 64-bit ptr field on 32-bit kernels as well.
>> >> > >
>> >> > > Thanks,
>> >> > >
>> >> >
>> >> > So... back to V1 then ?
>> >>
>> >> Or add more stuff as in :
>> >
>> > diff against v2, WDYT ?
>>
>> I like this approach slightly better, because it moves the preprocessor ifdefs
>> into
>> rseq_get_rseq_cs and clear_rseq_cs, while keeping the same behavior for a 32-bit
>> process running on native 32-bit kernel and as compat task on a 64-bit kernel.
>>
>> That being said, I don't expect anyone to care much about performance of 32-bit
>> kernels, so we could use copy_from_user() on 32-bit kernels to remove
>> special-cases
>> in 32-bit specific code. This would eliminate the 32-bit specific "padding"
>> read, and
>> let the TASK_SIZE comparison handle the check for both 32-bit and 64-bit
>> kernels.
>>
>> As for clear_user(), I wonder whether we could simply keep using it, but change
>> the
>> clear_user() macro to figure out that it can use a faster 8-byte put_user ? I
>> find it
>> odd that performance optimizations which would be relevant elsewhere creep into
>> the
>> rseq code.
> 
> 
> clear_user() is a maze of arch-dependent macros/functions/assembly
> 
> I guess the same could be said from  copy_in_user(), but apparently we removed
> special-casing, like in commit a41e0d754240fe8ca9c4f2070bf67e3b0228aa22
> 
> Definitely it seems odd having to carefully choose between multiple methods.

As long as the ifdefs are localized within clearly identified wrappers in the
rseq code I don't mind doing the special-casing there.

The point which remains is that I don't think we want to optimize for speed
on 32-bit architectures when it adds special-casing and complexity to the 32-bit
build. I suspect there is less and less testing performed on 32-bit architectures
nowadays, and it's good that as much code as possible is shared between 32-bit and
64-bit builds to share the test coverage.

Thanks,

Mathieu

> 
> 
>>
>> Thanks,
>>
>> Mathieu
>>
>> >
>> > diff --git a/kernel/rseq.c b/kernel/rseq.c
>> > index
>> > f2eee3f7f5d330688c81cb2e57d47ca6b843873e..537b1f684efa11069990018ffa3642c209993011
>> > 100644
>> > --- a/kernel/rseq.c
>> > +++ b/kernel/rseq.c
>> > @@ -136,6 +136,10 @@ static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
>> > {
>> >        u32 ptr;
>> >
>> > +       if (get_user(ptr, &rseq->rseq_cs.ptr.padding))
>> > +               return -EFAULT;
>> > +       if (ptr)
>> > +               return -EINVAL;
>> >        if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
>> >                return -EFAULT;
>> >        *uptrp = (struct rseq_cs __user *)ptr;
>> > @@ -150,8 +154,9 @@ static int rseq_get_rseq_cs(struct task_struct *t,
>> > struct rseq_cs *rseq_cs)
>> >        u32 sig;
>> >        int ret;
>> >
>> > -       if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
>> > -               return -EFAULT;
>> > +       ret = rseq_get_cs_ptr(&urseq_cs, t->rseq);
>> > +       if (ret)
>> > +               return ret;
>> >        if (!urseq_cs) {
>> >                memset(rseq_cs, 0, sizeof(*rseq_cs));
>> >                return 0;
>> > @@ -237,7 +242,8 @@ static int clear_rseq_cs(struct task_struct *t)
>> > #ifdef CONFIG_64BIT
>> >        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
>> > #else
>> > -       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
>> > +       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
>> > +              put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
>> > #endif
>> >  }
>>
>> --
>> Mathieu Desnoyers
>> EfficiOS Inc.
> > http://www.efficios.com
Eric Dumazet April 13, 2021, 6:22 p.m. UTC | #9
On Tue, Apr 13, 2021 at 8:00 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>

> As long as the ifdefs are localized within clearly identified wrappers in the
> rseq code I don't mind doing the special-casing there.
>
> The point which remains is that I don't think we want to optimize for speed
> on 32-bit architectures when it adds special-casing and complexity to the 32-bit
> build. I suspect there is less and less testing performed on 32-bit architectures
> nowadays, and it's good that as much code as possible is shared between 32-bit and
> 64-bit builds to share the test coverage.
>

Quite frankly V1 was fine, I can't really make it look better.

> Thanks,
>
> Mathieu
>
> >
> >
> >>
> >> Thanks,
> >>
> >> Mathieu
> >>
> >> >
> >> > diff --git a/kernel/rseq.c b/kernel/rseq.c
> >> > index
> >> > f2eee3f7f5d330688c81cb2e57d47ca6b843873e..537b1f684efa11069990018ffa3642c209993011
> >> > 100644
> >> > --- a/kernel/rseq.c
> >> > +++ b/kernel/rseq.c
> >> > @@ -136,6 +136,10 @@ static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
> >> > {
> >> >        u32 ptr;
> >> >
> >> > +       if (get_user(ptr, &rseq->rseq_cs.ptr.padding))
> >> > +               return -EFAULT;
> >> > +       if (ptr)
> >> > +               return -EINVAL;
> >> >        if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
> >> >                return -EFAULT;
> >> >        *uptrp = (struct rseq_cs __user *)ptr;
> >> > @@ -150,8 +154,9 @@ static int rseq_get_rseq_cs(struct task_struct *t,
> >> > struct rseq_cs *rseq_cs)
> >> >        u32 sig;
> >> >        int ret;
> >> >
> >> > -       if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
> >> > -               return -EFAULT;
> >> > +       ret = rseq_get_cs_ptr(&urseq_cs, t->rseq);
> >> > +       if (ret)
> >> > +               return ret;
> >> >        if (!urseq_cs) {
> >> >                memset(rseq_cs, 0, sizeof(*rseq_cs));
> >> >                return 0;
> >> > @@ -237,7 +242,8 @@ static int clear_rseq_cs(struct task_struct *t)
> >> > #ifdef CONFIG_64BIT
> >> >        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
> >> > #else
> >> > -       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
> >> > +       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
> >> > +              put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
> >> > #endif
> >> >  }
> >>
> >> --
> >> Mathieu Desnoyers
> >> EfficiOS Inc.
> > > http://www.efficios.com
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
Arjun Roy April 13, 2021, 6:35 p.m. UTC | #10
On Tue, Apr 13, 2021 at 11:22 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 13, 2021 at 8:00 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
> >
>
> > As long as the ifdefs are localized within clearly identified wrappers in the
> > rseq code I don't mind doing the special-casing there.
> >
> > The point which remains is that I don't think we want to optimize for speed
> > on 32-bit architectures when it adds special-casing and complexity to the 32-bit
> > build. I suspect there is less and less testing performed on 32-bit architectures
> > nowadays, and it's good that as much code as possible is shared between 32-bit and
> > 64-bit builds to share the test coverage.
> >
>
> Quite frankly V1 was fine, I can't really make it looking better.
>
> > Thanks,
> >
> > Mathieu
> >
> > >
> > >
> > >>
> > >> Thanks,
> > >>
> > >> Mathieu
> > >>

If we're special-casing 64-bit architectures anyway - unrolling the
32B copy_from_user() for struct rseq_cs appears to yield roughly 5-10%
savings on x86-64 when I measured it (well, in a microbenchmark, not
in rseq_get_rseq_cs() directly). Perhaps that could be an additional
avenue for improvement here.

-Arjun

> > >> >
> > >> > diff --git a/kernel/rseq.c b/kernel/rseq.c
> > >> > index
> > >> > f2eee3f7f5d330688c81cb2e57d47ca6b843873e..537b1f684efa11069990018ffa3642c209993011
> > >> > 100644
> > >> > --- a/kernel/rseq.c
> > >> > +++ b/kernel/rseq.c
> > >> > @@ -136,6 +136,10 @@ static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
> > >> > {
> > >> >        u32 ptr;
> > >> >
> > >> > +       if (get_user(ptr, &rseq->rseq_cs.ptr.padding))
> > >> > +               return -EFAULT;
> > >> > +       if (ptr)
> > >> > +               return -EINVAL;
> > >> >        if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
> > >> >                return -EFAULT;
> > >> >        *uptrp = (struct rseq_cs __user *)ptr;
> > >> > @@ -150,8 +154,9 @@ static int rseq_get_rseq_cs(struct task_struct *t,
> > >> > struct rseq_cs *rseq_cs)
> > >> >        u32 sig;
> > >> >        int ret;
> > >> >
> > >> > -       if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
> > >> > -               return -EFAULT;
> > >> > +       ret = rseq_get_cs_ptr(&urseq_cs, t->rseq);
> > >> > +       if (ret)
> > >> > +               return ret;
> > >> >        if (!urseq_cs) {
> > >> >                memset(rseq_cs, 0, sizeof(*rseq_cs));
> > >> >                return 0;
> > >> > @@ -237,7 +242,8 @@ static int clear_rseq_cs(struct task_struct *t)
> > >> > #ifdef CONFIG_64BIT
> > >> >        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
> > >> > #else
> > >> > -       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
> > >> > +       return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32) |
> > >> > +              put_user(0UL, &t->rseq->rseq_cs.ptr.padding);
> > >> > #endif
> > >> >  }
> > >>
> > >> --
> > >> Mathieu Desnoyers
> > >> EfficiOS Inc.
> > > > http://www.efficios.com
> >
> > --
> > Mathieu Desnoyers
> > EfficiOS Inc.
> > http://www.efficios.com
Mathieu Desnoyers April 13, 2021, 7:13 p.m. UTC | #11
----- On Apr 13, 2021, at 2:22 PM, Eric Dumazet edumazet@google.com wrote:

> On Tue, Apr 13, 2021 at 8:00 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
>>
> 
>> As long as the ifdefs are localized within clearly identified wrappers in the
>> rseq code I don't mind doing the special-casing there.
>>
>> The point which remains is that I don't think we want to optimize for speed
>> on 32-bit architectures when it adds special-casing and complexity to the 32-bit
>> build. I suspect there is less and less testing performed on 32-bit
>> architectures
>> nowadays, and it's good that as much code as possible is shared between 32-bit
>> and
>> 64-bit builds to share the test coverage.
>>
> 
> Quite frankly V1 was fine, I can't really make it looking better.

Yes, I'm OK with V1 of that patch.

Thanks,

Mathieu
David Laight April 13, 2021, 9:19 p.m. UTC | #12
> If we're special-casing 64-bit architectures anyways - unrolling the
> 32B copy_from_user() for struct rseq_cs appears to be roughly 5-10%
> savings on x86-64 when I measured it (well, in a microbenchmark, not
> in rseq_get_rseq_cs() directly). Perhaps that could be an additional
> avenue for improvement here.

The killer is usually 'user copy hardening'.
It significantly slows down sendmsg() and recvmsg().
I've got measurable performance improvements by
using __copy_from_user() when the buffer size has
already been checked - but isn't a compile-time constant.

There is also scope for using _get_user() when reading
iovec[] (instead of copy_from_user()) and doing all the
bounds checks (etc.) in the loop.
That gives a measurable improvement for writev("/dev/null").
I must sort those patches out again.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Arjun Roy April 13, 2021, 10:03 p.m. UTC | #13
On Tue, Apr 13, 2021 at 2:19 PM David Laight <David.Laight@aculab.com> wrote:
>
> > If we're special-casing 64-bit architectures anyways - unrolling the
> > 32B copy_from_user() for struct rseq_cs appears to be roughly 5-10%
> > savings on x86-64 when I measured it (well, in a microbenchmark, not
> > in rseq_get_rseq_cs() directly). Perhaps that could be an additional
> > avenue for improvement here.
>
> The killer is usually 'user copy hardening'.
> It significantly slows down sendmsg() and recvmsg().
> I've got measurable performance improvements by
> using __copy_from_user() when the buffer since has
> already been checked - but isn't a compile-time constant.
>
> There is also scope for using _get_user() when reading
> iovec[] (instead of copy_from_user()) and doing all the
> bound checks (etc) in the loop.
> That gives a measurable improvement for writev("/dev/null").
> I must sort those patches out again.
>
>         David
>

In this case I mean replacing copy_from_user(rseq_cs, urseq_cs,
sizeof(*rseq_cs)) with 4 unsafe_get_user() calls (4 x 8B = 32B total),
wrapped in user_read_access_begin()/end(), which I think would just
bypass user copy hardening (as far as I can tell).
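
As a rough illustration (sketch only, not code from this thread; the helper
name is made up, and it assumes the generic unsafe_get_user() /
user_read_access_begin() uaccess API, reading the struct rseq_cs fields
individually rather than as four raw 8-byte loads):

/*
 * Sketch: unrolled read of the 32-byte struct rseq_cs under a single
 * user_read_access_begin() section, avoiding the per-call hardening
 * checks done by copy_from_user(). Helper name is hypothetical.
 */
static int rseq_copy_cs_unrolled(struct rseq_cs *rseq_cs,
				 struct rseq_cs __user *urseq_cs)
{
	if (!user_read_access_begin(urseq_cs, sizeof(*urseq_cs)))
		return -EFAULT;
	unsafe_get_user(rseq_cs->version, &urseq_cs->version, efault);
	unsafe_get_user(rseq_cs->flags, &urseq_cs->flags, efault);
	unsafe_get_user(rseq_cs->start_ip, &urseq_cs->start_ip, efault);
	unsafe_get_user(rseq_cs->post_commit_offset,
			&urseq_cs->post_commit_offset, efault);
	unsafe_get_user(rseq_cs->abort_ip, &urseq_cs->abort_ip, efault);
	user_read_access_end();
	return 0;
efault:
	user_read_access_end();
	return -EFAULT;
}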

-Arjun

> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
David Laight April 14, 2021, 7:55 a.m. UTC | #14
From: Arjun Roy
> Sent: 13 April 2021 23:04
> 
> On Tue, Apr 13, 2021 at 2:19 PM David Laight <David.Laight@aculab.com> wrote:
> >
> > > If we're special-casing 64-bit architectures anyways - unrolling the
> > > 32B copy_from_user() for struct rseq_cs appears to be roughly 5-10%
> > > savings on x86-64 when I measured it (well, in a microbenchmark, not
> > > in rseq_get_rseq_cs() directly). Perhaps that could be an additional
> > > avenue for improvement here.
> >
> > The killer is usually 'user copy hardening'.
> > It significantly slows down sendmsg() and recvmsg().
> > I've got measurable performance improvements by
> > using __copy_from_user() when the buffer since has
> > already been checked - but isn't a compile-time constant.
> >
> > There is also scope for using _get_user() when reading
> > iovec[] (instead of copy_from_user()) and doing all the
> > bound checks (etc) in the loop.
> > That gives a measurable improvement for writev("/dev/null").
> > I must sort those patches out again.
> >
> >         David
> >
> 
> In this case I mean replacing copy_from_user(rseq_cs, urseq_cs,
> sizeof(*rseq_cs)) with  4 (x8B=32 total) unsafe_get_user() (wrapped in
> user_read_access_begin/end()) which I think would just bypass user
> copy hardening (as far as I can tell).

Yes that is one advantage over any of the get_user() calls.
You also lose all the 'how shall we optimise this' checks
in copy_from_user().

Repeated unsafe_get_user() calls are crying out for an optimisation.
You get something like:
	failed = 0;
	copy();
	if (failed) goto error;
	copy();
	if (failed) goto error;
Where 'failed' is set by the fault handler.

This could be optimised to:
	failed = 0;
	copy();
	copy();
	if (failed) goto error;
Even if it faults on every invalid address it probably
doesn't matter - no one cares about that path.

I've not really looked at how it could be achieved though.

It might be that the 'asm goto with outputs' variant
manages to avoid the compare and jump.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Eric Dumazet April 14, 2021, 4 p.m. UTC | #15
On Wed, Apr 14, 2021 at 9:55 AM David Laight <David.Laight@aculab.com> wrote:
>
> From: Arjun Roy
> > Sent: 13 April 2021 23:04
> >
> > On Tue, Apr 13, 2021 at 2:19 PM David Laight <David.Laight@aculab.com> wrote:
> > >
> > > > If we're special-casing 64-bit architectures anyways - unrolling the
> > > > 32B copy_from_user() for struct rseq_cs appears to be roughly 5-10%
> > > > savings on x86-64 when I measured it (well, in a microbenchmark, not
> > > > in rseq_get_rseq_cs() directly). Perhaps that could be an additional
> > > > avenue for improvement here.
> > >
> > > The killer is usually 'user copy hardening'.
> > > It significantly slows down sendmsg() and recvmsg().
> > > I've got measurable performance improvements by
> > > using __copy_from_user() when the buffer since has
> > > already been checked - but isn't a compile-time constant.
> > >
> > > There is also scope for using _get_user() when reading
> > > iovec[] (instead of copy_from_user()) and doing all the
> > > bound checks (etc) in the loop.
> > > That gives a measurable improvement for writev("/dev/null").
> > > I must sort those patches out again.
> > >
> > >         David
> > >
> >
> > In this case I mean replacing copy_from_user(rseq_cs, urseq_cs,
> > sizeof(*rseq_cs)) with  4 (x8B=32 total) unsafe_get_user() (wrapped in
> > user_read_access_begin/end()) which I think would just bypass user
> > copy hardening (as far as I can tell).
>
> Yes that is one advantage over any of the get_user() calls.
> You also lose all the 'how shall we optimise this' checks
> in copy_from_user().
>
> Repeated unsafe_get_user() calls are crying out for an optimisation.
> You get something like:
>         failed = 0;
>         copy();
>         if (failed) goto error;
>         copy();
>         if (failed) goto error;
> Where 'failed' is set by the fault handler.
>
> This could be optimised to:
>         failed = 0;
>         copy();
>         copy();
>         if (failed) goto error;
> Even if it faults on every invalid address it probably
> doesn't matter - no one cares about that path.


Which arch are you looking at?

On x86_64 at least, code generation is just perfect.
Not even a conditional jmp; it is all handled by exceptions (if any):

stac
copy();
copy();
clac


<out_of_line>
efault_end: do error recovery.



>
> I've not really looked at how it could be achieved though.
>
> It might be that the 'asm goto with outputs' variant
> manages to avoid the compare and jump.
>
>         David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
David Laight April 14, 2021, 4:08 p.m. UTC | #16
From: Eric Dumazet
> Sent: 14 April 2021 17:00
...
> > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > You get something like:
> >         failed = 0;
> >         copy();
> >         if (failed) goto error;
> >         copy();
> >         if (failed) goto error;
> > Where 'failed' is set by the fault handler.
> >
> > This could be optimised to:
> >         failed = 0;
> >         copy();
> >         copy();
> >         if (failed) goto error;
> > Even if it faults on every invalid address it probably
> > doesn't matter - no one cares about that path.
> 
> 
> On which arch are you looking at ?
> 
> On x86_64 at least, code generation is just perfect.
> Not even a conditional jmp, it is all handled by exceptions (if any)
> 
> stac
> copy();
> copy();
> clac
> 
> 
> <out_of_line>
> efault_end: do error recovery.

It will be x86_64.
I'm definitely seeing repeated tests of (IIRC) %rdx.

It may well be because the compiler isn't very new.
It will be an Ubuntu build of 9.3.0.
Does that support 'asm goto with outputs'? That may be the difference.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Eric Dumazet April 14, 2021, 4:10 p.m. UTC | #17
On Wed, Apr 14, 2021 at 6:08 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Eric Dumazet
> > Sent: 14 April 2021 17:00
> ...
> > > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > > You get something like:
> > >         failed = 0;
> > >         copy();
> > >         if (failed) goto error;
> > >         copy();
> > >         if (failed) goto error;
> > > Where 'failed' is set by the fault handler.
> > >
> > > This could be optimised to:
> > >         failed = 0;
> > >         copy();
> > >         copy();
> > >         if (failed) goto error;
> > > Even if it faults on every invalid address it probably
> > > doesn't matter - no one cares about that path.
> >
> >
> > On which arch are you looking at ?
> >
> > On x86_64 at least, code generation is just perfect.
> > Not even a conditional jmp, it is all handled by exceptions (if any)
> >
> > stac
> > copy();
> > copy();
> > clac
> >
> >
> > <out_of_line>
> > efault_end: do error recovery.
>
> It will be x86_64.
> I'm definitely seeing repeated tests of (IIRC) %rdx.
>
> It may well be because the compiler isn't very new.
> Will be an Ubuntu build of 9.3.0.
> Does that support 'asm goto with outputs' - which
> may be the difference.
>

Yep, probably. I am using some recent clang version.

>         David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
Arjun Roy April 14, 2021, 5:15 p.m. UTC | #18
On Wed, Apr 14, 2021 at 9:10 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Apr 14, 2021 at 6:08 PM David Laight <David.Laight@aculab.com> wrote:
> >
> > From: Eric Dumazet
> > > Sent: 14 April 2021 17:00
> > ...
> > > > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > > > You get something like:
> > > >         failed = 0;
> > > >         copy();
> > > >         if (failed) goto error;
> > > >         copy();
> > > >         if (failed) goto error;
> > > > Where 'failed' is set by the fault handler.
> > > >
> > > > This could be optimised to:
> > > >         failed = 0;
> > > >         copy();
> > > >         copy();
> > > >         if (failed) goto error;
> > > > Even if it faults on every invalid address it probably
> > > > doesn't matter - no one cares about that path.
> > >
> > >
> > > On which arch are you looking at ?
> > >
> > > On x86_64 at least, code generation is just perfect.
> > > Not even a conditional jmp, it is all handled by exceptions (if any)
> > >
> > > stac
> > > copy();
> > > copy();
> > > clac
> > >
> > >
> > > <out_of_line>
> > > efault_end: do error recovery.
> >
> > It will be x86_64.
> > I'm definitely seeing repeated tests of (IIRC) %rdx.
> >
> > It may well be because the compiler isn't very new.
> > Will be an Ubuntu build of 9.3.0.
> > Does that support 'asm goto with outputs' - which
> > may be the difference.
> >
>
> Yep, probably. I am using some recent clang version.
>

On x86-64 I can confirm: for me, it (4 x unsafe_get_user()) compiles
down to stac + lfence + 8 x mov + clac as straight-line code, and
results in roughly a 5%-10% speedup over copy_from_user().

-Arjun


> >         David
> >
> > -
> > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> > Registration No: 1397386 (Wales)
Eric Dumazet April 14, 2021, 5:35 p.m. UTC | #19
On Wed, Apr 14, 2021 at 7:15 PM Arjun Roy <arjunroy@google.com> wrote:
>
> On Wed, Apr 14, 2021 at 9:10 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Apr 14, 2021 at 6:08 PM David Laight <David.Laight@aculab.com> wrote:
> > >
> > > From: Eric Dumazet
> > > > Sent: 14 April 2021 17:00
> > > ...
> > > > > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > > > > You get something like:
> > > > >         failed = 0;
> > > > >         copy();
> > > > >         if (failed) goto error;
> > > > >         copy();
> > > > >         if (failed) goto error;
> > > > > Where 'failed' is set by the fault handler.
> > > > >
> > > > > This could be optimised to:
> > > > >         failed = 0;
> > > > >         copy();
> > > > >         copy();
> > > > >         if (failed) goto error;
> > > > > Even if it faults on every invalid address it probably
> > > > > doesn't matter - no one cares about that path.
> > > >
> > > >
> > > > On which arch are you looking at ?
> > > >
> > > > On x86_64 at least, code generation is just perfect.
> > > > Not even a conditional jmp, it is all handled by exceptions (if any)
> > > >
> > > > stac
> > > > copy();
> > > > copy();
> > > > clac
> > > >
> > > >
> > > > <out_of_line>
> > > > efault_end: do error recovery.
> > >
> > > It will be x86_64.
> > > I'm definitely seeing repeated tests of (IIRC) %rdx.
> > >
> > > It may well be because the compiler isn't very new.
> > > Will be an Ubuntu build of 9.3.0.
> > > Does that support 'asm goto with outputs' - which
> > > may be the difference.
> > >
> >
> > Yep, probably. I am using some recent clang version.
> >
>
> On x86-64 I can confirm, for me it (4 x unsafe_get_user()) compiles
> down to stac + lfence + 8 x mov + clac as straight line code. And
> results in roughly a 5%-10% speedup over copy_from_user().
>

But rseq_get_rseq_cs() would still need three different copies,
with 3 stac+lfence+clac sequences.

Maybe we need to enclose all __rseq_handle_notify_resume() operations
in a single section.







> -Arjun
>
>
> > >         David
> > >
> > > -
> > > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> > > Registration No: 1397386 (Wales)
Arjun Roy April 14, 2021, 8:15 p.m. UTC | #20
On Wed, Apr 14, 2021 at 10:35 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Apr 14, 2021 at 7:15 PM Arjun Roy <arjunroy@google.com> wrote:
> >
> > On Wed, Apr 14, 2021 at 9:10 AM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Wed, Apr 14, 2021 at 6:08 PM David Laight <David.Laight@aculab.com> wrote:
> > > >
> > > > From: Eric Dumazet
> > > > > Sent: 14 April 2021 17:00
> > > > ...
> > > > > > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > > > > > You get something like:
> > > > > >         failed = 0;
> > > > > >         copy();
> > > > > >         if (failed) goto error;
> > > > > >         copy();
> > > > > >         if (failed) goto error;
> > > > > > Where 'failed' is set by the fault handler.
> > > > > >
> > > > > > This could be optimised to:
> > > > > >         failed = 0;
> > > > > >         copy();
> > > > > >         copy();
> > > > > >         if (failed) goto error;
> > > > > > Even if it faults on every invalid address it probably
> > > > > > doesn't matter - no one cares about that path.
> > > > >
> > > > >
> > > > > On which arch are you looking at ?
> > > > >
> > > > > On x86_64 at least, code generation is just perfect.
> > > > > Not even a conditional jmp, it is all handled by exceptions (if any)
> > > > >
> > > > > stac
> > > > > copy();
> > > > > copy();
> > > > > clac
> > > > >
> > > > >
> > > > > <out_of_line>
> > > > > efault_end: do error recovery.
> > > >
> > > > It will be x86_64.
> > > > I'm definitely seeing repeated tests of (IIRC) %rdx.
> > > >
> > > > It may well be because the compiler isn't very new.
> > > > Will be an Ubuntu build of 9.3.0.
> > > > Does that support 'asm goto with outputs' - which
> > > > may be the difference.
> > > >
> > >
> > > Yep, probably. I am using some recent clang version.
> > >
> >
> > On x86-64 I can confirm, for me it (4 x unsafe_get_user()) compiles
> > down to stac + lfence + 8 x mov + clac as straight line code. And
> > results in roughly a 5%-10% speedup over copy_from_user().
> >
>
> But rseq_get_rseq_cs() would still need three different copies,
> with 3 stac+lfence+clac sequences.
>
> Maybe we need to enclose all __rseq_handle_notify_resume() operations
> in a single section.
>
>

To provide a bit of further exposition on this point: with 4x
unsafe_get_user(), recall I mentioned a 5-10% improvement. On the other
hand, with 4x normal get_user() I saw something like a 100% regression
(i.e. a doubling of measured sys time).

I assume that's the fault of the multiple stac+clac pairs.

-Arjun

>
>
>
>
>
> > -Arjun
> >
> >
> > > >         David
> > > >
> > > > -
> > > > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> > > > Registration No: 1397386 (Wales)
Eric Dumazet April 14, 2021, 8:25 p.m. UTC | #21
On Wed, Apr 14, 2021 at 10:15 PM Arjun Roy <arjunroy@google.com> wrote:
>
> On Wed, Apr 14, 2021 at 10:35 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Apr 14, 2021 at 7:15 PM Arjun Roy <arjunroy@google.com> wrote:
> > >
> > > On Wed, Apr 14, 2021 at 9:10 AM Eric Dumazet <edumazet@google.com> wrote:
> > > >
> > > > On Wed, Apr 14, 2021 at 6:08 PM David Laight <David.Laight@aculab.com> wrote:
> > > > >
> > > > > From: Eric Dumazet
> > > > > > Sent: 14 April 2021 17:00
> > > > > ...
> > > > > > > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > > > > > > You get something like:
> > > > > > >         failed = 0;
> > > > > > >         copy();
> > > > > > >         if (failed) goto error;
> > > > > > >         copy();
> > > > > > >         if (failed) goto error;
> > > > > > > Where 'failed' is set by the fault handler.
> > > > > > >
> > > > > > > This could be optimised to:
> > > > > > >         failed = 0;
> > > > > > >         copy();
> > > > > > >         copy();
> > > > > > >         if (failed) goto error;
> > > > > > > Even if it faults on every invalid address it probably
> > > > > > > doesn't matter - no one cares about that path.
> > > > > >
> > > > > >
> > > > > > On which arch are you looking at ?
> > > > > >
> > > > > > On x86_64 at least, code generation is just perfect.
> > > > > > Not even a conditional jmp, it is all handled by exceptions (if any)
> > > > > >
> > > > > > stac
> > > > > > copy();
> > > > > > copy();
> > > > > > clac
> > > > > >
> > > > > >
> > > > > > <out_of_line>
> > > > > > efault_end: do error recovery.
> > > > >
> > > > > It will be x86_64.
> > > > > I'm definitely seeing repeated tests of (IIRC) %rdx.
> > > > >
> > > > > It may well be because the compiler isn't very new.
> > > > > Will be an Ubuntu build of 9.3.0.
> > > > > Does that support 'asm goto with outputs' - which
> > > > > may be the difference.
> > > > >
> > > >
> > > > Yep, probably. I am using some recent clang version.
> > > >
> > >
> > > On x86-64 I can confirm, for me it (4 x unsafe_get_user()) compiles
> > > down to stac + lfence + 8 x mov + clac as straight line code. And
> > > results in roughly a 5%-10% speedup over copy_from_user().
> > >
> >
> > But rseq_get_rseq_cs() would still need three different copies,
> > with 3 stac+lfence+clac sequences.
> >
> > Maybe we need to enclose all __rseq_handle_notify_resume() operations
> > in a single section.
> >
> >
>
> To provide a bit of further exposition on this point, if you do 4x
> unsafe_get_user() recall I mentioned a 5-10% improvement. On the other
> hand, 4x normal get_user() I saw something like a 100% (ie. doubling
> of sys time measured) regression.
>
> I assume that's the fault of multiple stac+clac.


I was suggesting using only unsafe_get_user() and unsafe_put_user(),
with one surrounding stac/clac.

Basically what we had (partially) in our old Google kernels before
commit 8f2817701492 ("rseq: Use get_user/put_user rather than
__get_user/__put_user"), but with all the needed modern stuff.
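
A minimal sketch of that idea for the clear path (illustration only, not the
patch under review; it assumes the architecture provides an 8-byte
unsafe_put_user(), which is exactly the per-architecture concern raised
earlier for 32-bit):

/*
 * Sketch: clear rseq_cs.ptr64 with a single user-access section and
 * unsafe_put_user() instead of clear_user()/put_user().
 */
static int clear_rseq_cs(struct task_struct *t)
{
	u64 __user *uptr = &t->rseq->rseq_cs.ptr64;

	if (!user_write_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;
	unsafe_put_user(0ULL, uptr, efault);
	user_write_access_end();
	return 0;
efault:
	user_write_access_end();
	return -EFAULT;
}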
Arjun Roy April 14, 2021, 8:35 p.m. UTC | #22
On Wed, Apr 14, 2021 at 1:25 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Apr 14, 2021 at 10:15 PM Arjun Roy <arjunroy@google.com> wrote:
> >
> > On Wed, Apr 14, 2021 at 10:35 AM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Wed, Apr 14, 2021 at 7:15 PM Arjun Roy <arjunroy@google.com> wrote:
> > > >
> > > > On Wed, Apr 14, 2021 at 9:10 AM Eric Dumazet <edumazet@google.com> wrote:
> > > > >
> > > > > On Wed, Apr 14, 2021 at 6:08 PM David Laight <David.Laight@aculab.com> wrote:
> > > > > >
> > > > > > From: Eric Dumazet
> > > > > > > Sent: 14 April 2021 17:00
> > > > > > ...
> > > > > > > > Repeated unsafe_get_user() calls are crying out for an optimisation.
> > > > > > > > You get something like:
> > > > > > > >         failed = 0;
> > > > > > > >         copy();
> > > > > > > >         if (failed) goto error;
> > > > > > > >         copy();
> > > > > > > >         if (failed) goto error;
> > > > > > > > Where 'failed' is set by the fault handler.
> > > > > > > >
> > > > > > > > This could be optimised to:
> > > > > > > >         failed = 0;
> > > > > > > >         copy();
> > > > > > > >         copy();
> > > > > > > >         if (failed) goto error;
> > > > > > > > Even if it faults on every invalid address it probably
> > > > > > > > doesn't matter - no one cares about that path.
> > > > > > >
> > > > > > >
> > > > > > > On which arch are you looking at ?
> > > > > > >
> > > > > > > On x86_64 at least, code generation is just perfect.
> > > > > > > Not even a conditional jmp, it is all handled by exceptions (if any)
> > > > > > >
> > > > > > > stac
> > > > > > > copy();
> > > > > > > copy();
> > > > > > > clac
> > > > > > >
> > > > > > >
> > > > > > > <out_of_line>
> > > > > > > efault_end: do error recovery.
> > > > > >
> > > > > > It will be x86_64.
> > > > > > I'm definitely seeing repeated tests of (IIRC) %rdx.
> > > > > >
> > > > > > It may well be because the compiler isn't very new.
> > > > > > Will be an Ubuntu build of 9.3.0.
> > > > > > Does that support 'asm goto with outputs' - which
> > > > > > may be the difference.
> > > > > >
> > > > >
> > > > > Yep, probably. I am using some recent clang version.
> > > > >
> > > >
> > > > On x86-64 I can confirm, for me it (4 x unsafe_get_user()) compiles
> > > > down to stac + lfence + 8 x mov + clac as straight line code. And
> > > > results in roughly a 5%-10% speedup over copy_from_user().
> > > >
> > >
> > > But rseq_get_rseq_cs() would still need three different copies,
> > > with 3 stac+lfence+clac sequences.
> > >
> > > Maybe we need to enclose all __rseq_handle_notify_resume() operations
> > > in a single section.
> > >
> > >
> >
> > To provide a bit of further exposition on this point, if you do 4x
> > unsafe_get_user() recall I mentioned a 5-10% improvement. On the other
> > hand, 4x normal get_user() I saw something like a 100% (ie. doubling
> > of sys time measured) regression.
> >
> > I assume that's the fault of multiple stac+clac.
>
>
> I was suggesting only using unsafe_get_user() and unsafe_put_user(),
> and one surrounding stac/clac
>
> Basically what we had (partially) in our old Google kernels, before
> commit 8f2817701492 ("rseq: Use get_user/put_user rather than
> __get_user/__put_user")
> but with all the needed modern stuff.

Yep - in agreement with that approach.

-Arjun

Patch

diff --git a/kernel/rseq.c b/kernel/rseq.c
index cfe01ab5253c1c424c0e8b25acbb6a8e1b41a5b6..f2eee3f7f5d330688c81cb2e57d47ca6b843873e 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -119,23 +119,46 @@  static int rseq_reset_rseq_cpu_id(struct task_struct *t)
 	return 0;
 }
 
+#ifdef CONFIG_64BIT
+static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
+			   const struct rseq __user *rseq)
+{
+	u64 ptr;
+
+	if (get_user(ptr, &rseq->rseq_cs.ptr64))
+		return -EFAULT;
+	*uptrp = (struct rseq_cs __user *)ptr;
+	return 0;
+}
+#else
+static int rseq_get_cs_ptr(struct rseq_cs __user **uptrp,
+			   const struct rseq __user *rseq)
+{
+	u32 ptr;
+
+	if (get_user(ptr, &rseq->rseq_cs.ptr.ptr32))
+		return -EFAULT;
+	*uptrp = (struct rseq_cs __user *)ptr;
+	return 0;
+}
+#endif
+
 static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 {
 	struct rseq_cs __user *urseq_cs;
-	u64 ptr;
 	u32 __user *usig;
 	u32 sig;
 	int ret;
 
-	if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+	if (rseq_get_cs_ptr(&urseq_cs, t->rseq))
 		return -EFAULT;
-	if (!ptr) {
+	if (!urseq_cs) {
 		memset(rseq_cs, 0, sizeof(*rseq_cs));
 		return 0;
 	}
-	if (ptr >= TASK_SIZE)
+	if ((unsigned long)urseq_cs >= TASK_SIZE)
 		return -EINVAL;
-	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
+
 	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
 		return -EFAULT;
 
@@ -211,9 +234,11 @@  static int clear_rseq_cs(struct task_struct *t)
 	 *
 	 * Set rseq_cs to NULL.
 	 */
-	if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
-		return -EFAULT;
-	return 0;
+#ifdef CONFIG_64BIT
+	return put_user(0UL, &t->rseq->rseq_cs.ptr64);
+#else
+	return put_user(0UL, &t->rseq->rseq_cs.ptr.ptr32);
+#endif
 }
 
 /*