[PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
@ 2023-04-05 17:14 Stefan Berger
  2023-04-06 10:26 ` Christian Brauner
  0 siblings, 1 reply; 61+ messages in thread
From: Stefan Berger @ 2023-04-05 17:14 UTC (permalink / raw)
  To: zohar, linux-integrity, miklos
  Cc: linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs, amir73il, Stefan Berger

Overlayfs fails to notify IMA / EVM about file content modifications
and therefore IMA-appraised files may execute even though their file
signature does not validate against the changed hash of the file
anymore. To resolve this issue, add a call to integrity_notify_change()
to the ovl_release() function to notify the integrity subsystem about
file changes. The set flag triggers the re-evaluation of the file by
IMA / EVM once the file is accessed again.

Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
---
 fs/overlayfs/file.c       |  4 ++++
 include/linux/integrity.h |  6 ++++++
 security/integrity/iint.c | 13 +++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 6011f955436b..19b8f4bcc18c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -13,6 +13,7 @@
 #include <linux/security.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/integrity.h>
 #include "overlayfs.h"
 
 struct ovl_aio_req {
@@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
 
 static int ovl_release(struct inode *inode, struct file *file)
 {
+	if (file->f_flags & O_ACCMODE)
+		integrity_notify_change(inode);
+
 	fput(file->private_data);
 
 	return 0;
diff --git a/include/linux/integrity.h b/include/linux/integrity.h
index 2ea0f2f65ab6..cefdeccc1619 100644
--- a/include/linux/integrity.h
+++ b/include/linux/integrity.h
@@ -23,6 +23,7 @@ enum integrity_status {
 #ifdef CONFIG_INTEGRITY
 extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
 extern void integrity_inode_free(struct inode *inode);
+extern void integrity_notify_change(struct inode *inode);
 extern void __init integrity_load_keys(void);
 
 #else
@@ -37,6 +38,11 @@ static inline void integrity_inode_free(struct inode *inode)
 	return;
 }
 
+static inline void integrity_notify_change(struct inode *inode)
+{
+	return;
+}
+
 static inline void integrity_load_keys(void)
 {
 }
diff --git a/security/integrity/iint.c b/security/integrity/iint.c
index 8638976f7990..70d2d716f3ae 100644
--- a/security/integrity/iint.c
+++ b/security/integrity/iint.c
@@ -85,6 +85,19 @@ static void iint_free(struct integrity_iint_cache *iint)
 	kmem_cache_free(iint_cache, iint);
 }
 
+void integrity_notify_change(struct inode *inode)
+{
+	struct integrity_iint_cache *iint;
+
+	if (!IS_IMA(inode))
+		return;
+
+	iint = integrity_iint_find(inode);
+	if (iint)
+		set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags);
+}
+EXPORT_SYMBOL_GPL(integrity_notify_change);
+
 /**
  * integrity_inode_get - find or allocate an iint associated with an inode
  * @inode: pointer to the inode
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-05 17:14 [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes Stefan Berger
@ 2023-04-06 10:26 ` Christian Brauner
  2023-04-06 14:05   ` Paul Moore
  0 siblings, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-04-06 10:26 UTC (permalink / raw)
  To: Stefan Berger
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
> Overlayfs fails to notify IMA / EVM about file content modifications
> and therefore IMA-appraised files may execute even though their file
> signature does not validate against the changed hash of the file
> anymore. To resolve this issue, add a call to integrity_notify_change()
> to the ovl_release() function to notify the integrity subsystem about
> file changes. The set flag triggers the re-evaluation of the file by
> IMA / EVM once the file is accessed again.
> 
> Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> ---
>  fs/overlayfs/file.c       |  4 ++++
>  include/linux/integrity.h |  6 ++++++
>  security/integrity/iint.c | 13 +++++++++++++
>  3 files changed, 23 insertions(+)
> 
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 6011f955436b..19b8f4bcc18c 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -13,6 +13,7 @@
>  #include <linux/security.h>
>  #include <linux/mm.h>
>  #include <linux/fs.h>
> +#include <linux/integrity.h>
>  #include "overlayfs.h"
>  
>  struct ovl_aio_req {
> @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
>  
>  static int ovl_release(struct inode *inode, struct file *file)
>  {
> +	if (file->f_flags & O_ACCMODE)
> +		integrity_notify_change(inode);
> +
>  	fput(file->private_data);
>  
>  	return 0;
> diff --git a/include/linux/integrity.h b/include/linux/integrity.h
> index 2ea0f2f65ab6..cefdeccc1619 100644
> --- a/include/linux/integrity.h
> +++ b/include/linux/integrity.h
> @@ -23,6 +23,7 @@ enum integrity_status {
>  #ifdef CONFIG_INTEGRITY
>  extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
>  extern void integrity_inode_free(struct inode *inode);
> +extern void integrity_notify_change(struct inode *inode);

I thought we concluded that ima is going to move into the security hook
infrastructure so it seems this should be a proper LSM hook?

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 10:26 ` Christian Brauner
@ 2023-04-06 14:05   ` Paul Moore
  2023-04-06 14:20     ` Stefan Berger
  0 siblings, 1 reply; 61+ messages in thread
From: Paul Moore @ 2023-04-06 14:05 UTC (permalink / raw)
  To: Christian Brauner, Stefan Berger
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, Apr 6, 2023 at 6:26 AM Christian Brauner <brauner@kernel.org> wrote:
> On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
> > Overlayfs fails to notify IMA / EVM about file content modifications
> > and therefore IMA-appraised files may execute even though their file
> > signature does not validate against the changed hash of the file
> > anymore. To resolve this issue, add a call to integrity_notify_change()
> > to the ovl_release() function to notify the integrity subsystem about
> > file changes. The set flag triggers the re-evaluation of the file by
> > IMA / EVM once the file is accessed again.
> >
> > Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> > ---
> >  fs/overlayfs/file.c       |  4 ++++
> >  include/linux/integrity.h |  6 ++++++
> >  security/integrity/iint.c | 13 +++++++++++++
> >  3 files changed, 23 insertions(+)
> >
> > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > index 6011f955436b..19b8f4bcc18c 100644
> > --- a/fs/overlayfs/file.c
> > +++ b/fs/overlayfs/file.c
> > @@ -13,6 +13,7 @@
> >  #include <linux/security.h>
> >  #include <linux/mm.h>
> >  #include <linux/fs.h>
> > +#include <linux/integrity.h>
> >  #include "overlayfs.h"
> >
> >  struct ovl_aio_req {
> > @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
> >
> >  static int ovl_release(struct inode *inode, struct file *file)
> >  {
> > +     if (file->f_flags & O_ACCMODE)
> > +             integrity_notify_change(inode);
> > +
> >       fput(file->private_data);
> >
> >       return 0;
> > diff --git a/include/linux/integrity.h b/include/linux/integrity.h
> > index 2ea0f2f65ab6..cefdeccc1619 100644
> > --- a/include/linux/integrity.h
> > +++ b/include/linux/integrity.h
> > @@ -23,6 +23,7 @@ enum integrity_status {
> >  #ifdef CONFIG_INTEGRITY
> >  extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
> >  extern void integrity_inode_free(struct inode *inode);
> > +extern void integrity_notify_change(struct inode *inode);
>
> I thought we concluded that ima is going to move into the security hook
> infrastructure so it seems this should be a proper LSM hook?

We are working towards migrating IMA/EVM to the LSM layer, but there
are a few things we need to fix/update/remove first; if anyone is
curious, you can join the LSM list as we've been discussing some of
these changes this week.  Bug fixes like this should probably remain
as IMA/EVM calls for the time being, with the understanding that they
will migrate over with the rest of IMA/EVM.

That said, we should give Mimi a chance to review this patch as it is
possible there is a different/better approach.  A bit of patience may
be required as I know Mimi is very busy at the moment.

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 14:05   ` Paul Moore
@ 2023-04-06 14:20     ` Stefan Berger
  2023-04-06 14:36       ` Paul Moore
  0 siblings, 1 reply; 61+ messages in thread
From: Stefan Berger @ 2023-04-06 14:20 UTC (permalink / raw)
  To: Paul Moore, Christian Brauner
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 10:05, Paul Moore wrote:
> On Thu, Apr 6, 2023 at 6:26 AM Christian Brauner <brauner@kernel.org> wrote:
>> On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
>>> Overlayfs fails to notify IMA / EVM about file content modifications
>>> and therefore IMA-appraised files may execute even though their file
>>> signature does not validate against the changed hash of the file
>>> anymore. To resolve this issue, add a call to integrity_notify_change()
>>> to the ovl_release() function to notify the integrity subsystem about
>>> file changes. The set flag triggers the re-evaluation of the file by
>>> IMA / EVM once the file is accessed again.
>>>
>>> Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
>>> ---
>>>   fs/overlayfs/file.c       |  4 ++++
>>>   include/linux/integrity.h |  6 ++++++
>>>   security/integrity/iint.c | 13 +++++++++++++
>>>   3 files changed, 23 insertions(+)
>>>
>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>>> index 6011f955436b..19b8f4bcc18c 100644
>>> --- a/fs/overlayfs/file.c
>>> +++ b/fs/overlayfs/file.c
>>> @@ -13,6 +13,7 @@
>>>   #include <linux/security.h>
>>>   #include <linux/mm.h>
>>>   #include <linux/fs.h>
>>> +#include <linux/integrity.h>
>>>   #include "overlayfs.h"
>>>
>>>   struct ovl_aio_req {
>>> @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
>>>
>>>   static int ovl_release(struct inode *inode, struct file *file)
>>>   {
>>> +     if (file->f_flags & O_ACCMODE)
>>> +             integrity_notify_change(inode);
>>> +
>>>        fput(file->private_data);
>>>
>>>        return 0;
>>> diff --git a/include/linux/integrity.h b/include/linux/integrity.h
>>> index 2ea0f2f65ab6..cefdeccc1619 100644
>>> --- a/include/linux/integrity.h
>>> +++ b/include/linux/integrity.h
>>> @@ -23,6 +23,7 @@ enum integrity_status {
>>>   #ifdef CONFIG_INTEGRITY
>>>   extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
>>>   extern void integrity_inode_free(struct inode *inode);
>>> +extern void integrity_notify_change(struct inode *inode);
>>
>> I thought we concluded that ima is going to move into the security hook
>> infrastructure so it seems this should be a proper LSM hook?
> 
> We are working towards migrating IMA/EVM to the LSM layer, but there
> are a few things we need to fix/update/remove first; if anyone is
> curious, you can join the LSM list as we've been discussing some of
> these changes this week.  Bug fixes like this should probably remain
> as IMA/EVM calls for the time being, with the understanding that they
> will migrate over with the rest of IMA/EVM.
> 
> That said, we should give Mimi a chance to review this patch as it is
> possible there is a different/better approach.  A bit of patience may
> be required as I know Mimi is very busy at the moment.
> 

There may be a better approach actually by increasing the inode's i_version,
which then should trigger the appropriate path in ima_check_last_writer().

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 14:20     ` Stefan Berger
@ 2023-04-06 14:36       ` Paul Moore
  2023-04-06 15:01         ` Christian Brauner
  2023-04-06 16:10         ` Stefan Berger
  0 siblings, 2 replies; 61+ messages in thread
From: Paul Moore @ 2023-04-06 14:36 UTC (permalink / raw)
  To: Stefan Berger
  Cc: Christian Brauner, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, Apr 6, 2023 at 10:20 AM Stefan Berger <stefanb@linux.ibm.com> wrote:
> On 4/6/23 10:05, Paul Moore wrote:
> > On Thu, Apr 6, 2023 at 6:26 AM Christian Brauner <brauner@kernel.org> wrote:
> >> On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
> >>> Overlayfs fails to notify IMA / EVM about file content modifications
> >>> and therefore IMA-appraised files may execute even though their file
> >>> signature does not validate against the changed hash of the file
> >>> anymore. To resolve this issue, add a call to integrity_notify_change()
> >>> to the ovl_release() function to notify the integrity subsystem about
> >>> file changes. The set flag triggers the re-evaluation of the file by
> >>> IMA / EVM once the file is accessed again.
> >>>
> >>> Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> >>> ---
> >>>   fs/overlayfs/file.c       |  4 ++++
> >>>   include/linux/integrity.h |  6 ++++++
> >>>   security/integrity/iint.c | 13 +++++++++++++
> >>>   3 files changed, 23 insertions(+)
> >>>
> >>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> >>> index 6011f955436b..19b8f4bcc18c 100644
> >>> --- a/fs/overlayfs/file.c
> >>> +++ b/fs/overlayfs/file.c
> >>> @@ -13,6 +13,7 @@
> >>>   #include <linux/security.h>
> >>>   #include <linux/mm.h>
> >>>   #include <linux/fs.h>
> >>> +#include <linux/integrity.h>
> >>>   #include "overlayfs.h"
> >>>
> >>>   struct ovl_aio_req {
> >>> @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
> >>>
> >>>   static int ovl_release(struct inode *inode, struct file *file)
> >>>   {
> >>> +     if (file->f_flags & O_ACCMODE)
> >>> +             integrity_notify_change(inode);
> >>> +
> >>>        fput(file->private_data);
> >>>
> >>>        return 0;
> >>> diff --git a/include/linux/integrity.h b/include/linux/integrity.h
> >>> index 2ea0f2f65ab6..cefdeccc1619 100644
> >>> --- a/include/linux/integrity.h
> >>> +++ b/include/linux/integrity.h
> >>> @@ -23,6 +23,7 @@ enum integrity_status {
> >>>   #ifdef CONFIG_INTEGRITY
> >>>   extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
> >>>   extern void integrity_inode_free(struct inode *inode);
> >>> +extern void integrity_notify_change(struct inode *inode);
> >>
> >> I thought we concluded that ima is going to move into the security hook
> >> infrastructure so it seems this should be a proper LSM hook?
> >
> > We are working towards migrating IMA/EVM to the LSM layer, but there
> > are a few things we need to fix/update/remove first; if anyone is
> > curious, you can join the LSM list as we've been discussing some of
> > these changes this week.  Bug fixes like this should probably remain
> > as IMA/EVM calls for the time being, with the understanding that they
> > will migrate over with the rest of IMA/EVM.
> >
> > That said, we should give Mimi a chance to review this patch as it is
> > possible there is a different/better approach.  A bit of patience may
> > be required as I know Mimi is very busy at the moment.
>
> There may be a better approach actually by increasing the inode's i_version,
> which then should trigger the appropriate path in ima_check_last_writer().

I'm not the VFS/inode expert here, but I thought the inode's i_version
field was only supposed to be bumped when the inode metadata changed,
not necessarily the file contents, right?

That said, overlayfs is a bit different so maybe that's okay, but I
think we would need to hear from the VFS folks if this is acceptable.

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 14:36       ` Paul Moore
@ 2023-04-06 15:01         ` Christian Brauner
  2023-04-06 18:46           ` Jeff Layton
  2023-04-06 16:10         ` Stefan Berger
  1 sibling, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-04-06 15:01 UTC (permalink / raw)
  To: Paul Moore
  Cc: Stefan Berger, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il,
	Jeff Layton

On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> On Thu, Apr 6, 2023 at 10:20 AM Stefan Berger <stefanb@linux.ibm.com> wrote:
> > On 4/6/23 10:05, Paul Moore wrote:
> > > On Thu, Apr 6, 2023 at 6:26 AM Christian Brauner <brauner@kernel.org> wrote:
> > >> On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
> > >>> Overlayfs fails to notify IMA / EVM about file content modifications
> > >>> and therefore IMA-appraised files may execute even though their file
> > >>> signature does not validate against the changed hash of the file
> > >>> anymore. To resolve this issue, add a call to integrity_notify_change()
> > >>> to the ovl_release() function to notify the integrity subsystem about
> > >>> file changes. The set flag triggers the re-evaluation of the file by
> > >>> IMA / EVM once the file is accessed again.
> > >>>
> > >>> Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> > >>> ---
> > >>>   fs/overlayfs/file.c       |  4 ++++
> > >>>   include/linux/integrity.h |  6 ++++++
> > >>>   security/integrity/iint.c | 13 +++++++++++++
> > >>>   3 files changed, 23 insertions(+)
> > >>>
> > >>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > >>> index 6011f955436b..19b8f4bcc18c 100644
> > >>> --- a/fs/overlayfs/file.c
> > >>> +++ b/fs/overlayfs/file.c
> > >>> @@ -13,6 +13,7 @@
> > >>>   #include <linux/security.h>
> > >>>   #include <linux/mm.h>
> > >>>   #include <linux/fs.h>
> > >>> +#include <linux/integrity.h>
> > >>>   #include "overlayfs.h"
> > >>>
> > >>>   struct ovl_aio_req {
> > >>> @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
> > >>>
> > >>>   static int ovl_release(struct inode *inode, struct file *file)
> > >>>   {
> > >>> +     if (file->f_flags & O_ACCMODE)
> > >>> +             integrity_notify_change(inode);
> > >>> +
> > >>>        fput(file->private_data);
> > >>>
> > >>>        return 0;
> > >>> diff --git a/include/linux/integrity.h b/include/linux/integrity.h
> > >>> index 2ea0f2f65ab6..cefdeccc1619 100644
> > >>> --- a/include/linux/integrity.h
> > >>> +++ b/include/linux/integrity.h
> > >>> @@ -23,6 +23,7 @@ enum integrity_status {
> > >>>   #ifdef CONFIG_INTEGRITY
> > >>>   extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
> > >>>   extern void integrity_inode_free(struct inode *inode);
> > >>> +extern void integrity_notify_change(struct inode *inode);
> > >>
> > >> I thought we concluded that ima is going to move into the security hook
> > >> infrastructure so it seems this should be a proper LSM hook?
> > >
> > > We are working towards migrating IMA/EVM to the LSM layer, but there
> > > are a few things we need to fix/update/remove first; if anyone is
> > > curious, you can join the LSM list as we've been discussing some of
> > > these changes this week.  Bug fixes like this should probably remain
> > > as IMA/EVM calls for the time being, with the understanding that they
> > > will migrate over with the rest of IMA/EVM.
> > >
> > > That said, we should give Mimi a chance to review this patch as it is
> > > possible there is a different/better approach.  A bit of patience may
> > > be required as I know Mimi is very busy at the moment.
> >
> > There may be a better approach actually by increasing the inode's i_version,
> > which then should trigger the appropriate path in ima_check_last_writer().
> 
> I'm not the VFS/inode expert here, but I thought the inode's i_version
> field was only supposed to be bumped when the inode metadata changed,
> not necessarily the file contents, right?
> 
> That said, overlayfs is a bit different so maybe that's okay, but I
> think we would need to hear from the VFS folks if this is acceptable.

Ccing Jeff for awareness since he did the i_version rework a short time ago.

The documentation in include/linux/iversion.h states:

 * [...] The i_version must
 * appear larger to observers if there was an explicit change to the inode's
 * data or metadata since it was last queried.

What I'm less sure about in all of this is why this is called in ovl_release()
and whether it's correct to increment the overlayfs inode's i_version.

The change is done to the inode of the copied-up/modified file in the
upper layer. So the i_version should already be incremented when we call into
the upper layer usually via vfs_*() methods.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 14:36       ` Paul Moore
  2023-04-06 15:01         ` Christian Brauner
@ 2023-04-06 16:10         ` Stefan Berger
  1 sibling, 0 replies; 61+ messages in thread
From: Stefan Berger @ 2023-04-06 16:10 UTC (permalink / raw)
  To: Paul Moore
  Cc: Christian Brauner, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 10:36, Paul Moore wrote:
> On Thu, Apr 6, 2023 at 10:20 AM Stefan Berger <stefanb@linux.ibm.com> wrote:
>> On 4/6/23 10:05, Paul Moore wrote:
>>> On Thu, Apr 6, 2023 at 6:26 AM Christian Brauner <brauner@kernel.org> wrote:
>>>> On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
>>>>> Overlayfs fails to notify IMA / EVM about file content modifications
>>>>> and therefore IMA-appraised files may execute even though their file
>>>>> signature does not validate against the changed hash of the file
>>>>> anymore. To resolve this issue, add a call to integrity_notify_change()
>>>>> to the ovl_release() function to notify the integrity subsystem about
>>>>> file changes. The set flag triggers the re-evaluation of the file by
>>>>> IMA / EVM once the file is accessed again.
>>>>>
>>>>> Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
>>>>> ---
>>>>>    fs/overlayfs/file.c       |  4 ++++
>>>>>    include/linux/integrity.h |  6 ++++++
>>>>>    security/integrity/iint.c | 13 +++++++++++++
>>>>>    3 files changed, 23 insertions(+)
>>>>>
>>>>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>>>>> index 6011f955436b..19b8f4bcc18c 100644
>>>>> --- a/fs/overlayfs/file.c
>>>>> +++ b/fs/overlayfs/file.c
>>>>> @@ -13,6 +13,7 @@
>>>>>    #include <linux/security.h>
>>>>>    #include <linux/mm.h>
>>>>>    #include <linux/fs.h>
>>>>> +#include <linux/integrity.h>
>>>>>    #include "overlayfs.h"
>>>>>
>>>>>    struct ovl_aio_req {
>>>>> @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
>>>>>
>>>>>    static int ovl_release(struct inode *inode, struct file *file)
>>>>>    {
>>>>> +     if (file->f_flags & O_ACCMODE)
>>>>> +             integrity_notify_change(inode);
>>>>> +
>>>>>         fput(file->private_data);
>>>>>
>>>>>         return 0;
>>>>> diff --git a/include/linux/integrity.h b/include/linux/integrity.h
>>>>> index 2ea0f2f65ab6..cefdeccc1619 100644
>>>>> --- a/include/linux/integrity.h
>>>>> +++ b/include/linux/integrity.h
>>>>> @@ -23,6 +23,7 @@ enum integrity_status {
>>>>>    #ifdef CONFIG_INTEGRITY
>>>>>    extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
>>>>>    extern void integrity_inode_free(struct inode *inode);
>>>>> +extern void integrity_notify_change(struct inode *inode);
>>>>
>>>> I thought we concluded that ima is going to move into the security hook
>>>> infrastructure so it seems this should be a proper LSM hook?
>>>
>>> We are working towards migrating IMA/EVM to the LSM layer, but there
>>> are a few things we need to fix/update/remove first; if anyone is
>>> curious, you can join the LSM list as we've been discussing some of
>>> these changes this week.  Bug fixes like this should probably remain
>>> as IMA/EVM calls for the time being, with the understanding that they
>>> will migrate over with the rest of IMA/EVM.
>>>
>>> That said, we should give Mimi a chance to review this patch as it is
>>> possible there is a different/better approach.  A bit of patience may
>>> be required as I know Mimi is very busy at the moment.
>>
>> There may be a better approach actually by increasing the inode's i_version,
>> which then should trigger the appropriate path in ima_check_last_writer().
> 
> I'm not the VFS/inode expert here, but I thought the inode's i_version
> field was only supposed to be bumped when the inode metadata changed,
> not necessarily the file contents, right?
> 
> That said, overlayfs is a bit different so maybe that's okay, but I
> think we would need to hear from the VFS folks if this is acceptable.
> 

Exactly.

In ima_check_last_writer() I want to trigger the code path with iint->flags &= ...



	if (atomic_read(&inode->i_writecount) == 1) {
		update = test_and_clear_bit(IMA_UPDATE_XATTR,
					    &iint->atomic_flags);
		if (!IS_I_VERSION(inode) ||
		    !inode_eq_iversion(inode, iint->version) ||
		    (iint->flags & IMA_NEW_FILE)) {
			iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
			iint->measured_pcrs = 0;
			if (update)
				ima_update_xattr(iint, file);
		}
	}


This patch here resolves it for my use case and triggers the expected code paths when
ima_file_free() -> ima_check_last_writer() is called because then the i_version is seen
as having been modified.

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 6011f955436b..1dfe5e7bfe1c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -13,6 +13,7 @@
  #include <linux/security.h>
  #include <linux/mm.h>
  #include <linux/fs.h>
+#include <linux/iversion.h>
  #include "overlayfs.h"

  struct ovl_aio_req {
@@ -408,6 +409,8 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
                 if (ret != -EIOCBQUEUED)
                         ovl_aio_cleanup_handler(aio_req);
         }
+       if (ret > 0)
+               inode_maybe_inc_iversion(inode, false);
  out:
         revert_creds(old_cred);
  out_fdput:



I have been testing this in an OpenBMC/Yocto environment where overlayfs is used as
the root filesystem with the lower filesystem being a squashfs.

Regards,
    Stefan

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 15:01         ` Christian Brauner
@ 2023-04-06 18:46           ` Jeff Layton
  2023-04-06 19:11             ` Stefan Berger
  0 siblings, 1 reply; 61+ messages in thread
From: Jeff Layton @ 2023-04-06 18:46 UTC (permalink / raw)
  To: Christian Brauner, Paul Moore
  Cc: Stefan Berger, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > On Thu, Apr 6, 2023 at 10:20 AM Stefan Berger <stefanb@linux.ibm.com> wrote:
> > > On 4/6/23 10:05, Paul Moore wrote:
> > > > On Thu, Apr 6, 2023 at 6:26 AM Christian Brauner <brauner@kernel.org> wrote:
> > > > > On Wed, Apr 05, 2023 at 01:14:49PM -0400, Stefan Berger wrote:
> > > > > > Overlayfs fails to notify IMA / EVM about file content modifications
> > > > > > and therefore IMA-appraised files may execute even though their file
> > > > > > signature does not validate against the changed hash of the file
> > > > > > anymore. To resolve this issue, add a call to integrity_notify_change()
> > > > > > to the ovl_release() function to notify the integrity subsystem about
> > > > > > file changes. The set flag triggers the re-evaluation of the file by
> > > > > > IMA / EVM once the file is accessed again.
> > > > > > 
> > > > > > Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > > > ---
> > > > > >   fs/overlayfs/file.c       |  4 ++++
> > > > > >   include/linux/integrity.h |  6 ++++++
> > > > > >   security/integrity/iint.c | 13 +++++++++++++
> > > > > >   3 files changed, 23 insertions(+)
> > > > > > 
> > > > > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > > > > > index 6011f955436b..19b8f4bcc18c 100644
> > > > > > --- a/fs/overlayfs/file.c
> > > > > > +++ b/fs/overlayfs/file.c
> > > > > > @@ -13,6 +13,7 @@
> > > > > >   #include <linux/security.h>
> > > > > >   #include <linux/mm.h>
> > > > > >   #include <linux/fs.h>
> > > > > > +#include <linux/integrity.h>
> > > > > >   #include "overlayfs.h"
> > > > > > 
> > > > > >   struct ovl_aio_req {
> > > > > > @@ -169,6 +170,9 @@ static int ovl_open(struct inode *inode, struct file *file)
> > > > > > 
> > > > > >   static int ovl_release(struct inode *inode, struct file *file)
> > > > > >   {
> > > > > > +     if (file->f_flags & O_ACCMODE)
> > > > > > +             integrity_notify_change(inode);
> > > > > > +
> > > > > >        fput(file->private_data);
> > > > > > 
> > > > > >        return 0;
> > > > > > diff --git a/include/linux/integrity.h b/include/linux/integrity.h
> > > > > > index 2ea0f2f65ab6..cefdeccc1619 100644
> > > > > > --- a/include/linux/integrity.h
> > > > > > +++ b/include/linux/integrity.h
> > > > > > @@ -23,6 +23,7 @@ enum integrity_status {
> > > > > >   #ifdef CONFIG_INTEGRITY
> > > > > >   extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode);
> > > > > >   extern void integrity_inode_free(struct inode *inode);
> > > > > > +extern void integrity_notify_change(struct inode *inode);
> > > > > 
> > > > > I thought we concluded that ima is going to move into the security hook
> > > > > infrastructure so it seems this should be a proper LSM hook?
> > > > 
> > > > We are working towards migrating IMA/EVM to the LSM layer, but there
> > > > are a few things we need to fix/update/remove first; if anyone is
> > > > curious, you can join the LSM list as we've been discussing some of
> > > > these changes this week.  Bug fixes like this should probably remain
> > > > as IMA/EVM calls for the time being, with the understanding that they
> > > > will migrate over with the rest of IMA/EVM.
> > > > 
> > > > That said, we should give Mimi a chance to review this patch as it is
> > > > possible there is a different/better approach.  A bit of patience may
> > > > be required as I know Mimi is very busy at the moment.
> > > 
> > > There may be a better approach actually by increasing the inode's i_version,
> > > which then should trigger the appropriate path in ima_check_last_writer().
> > 
> > I'm not the VFS/inode expert here, but I thought the inode's i_version
> > field was only supposed to be bumped when the inode metadata changed,
> > not necessarily the file contents, right?
> > 

No. The i_version should change any time there is a "deliberate change"
to the file. That can be to the data or metadata, but it has to be in
response to some sort of deliberate, observable change -- something that
would cause an mtime or ctime change.

In practice, the i_version changes whenever the ctime changes, as
changing the mtime also changes the ctime.

> > That said, overlayfs is a bit different so maybe that's okay, but I
> > think we would need to hear from the VFS folks if this is acceptable.
> 
> Ccing Jeff for awareness since he did the i_version rework a short time ago.
> 
> The documentation in include/linux/iversion.h states:
> 
>  * [...] The i_version must
>  * appear larger to observers if there was an explicit change to the inode's
>  * data or metadata since it was last queried.
> 
> what I'm less sure in all of this is why this is called in ovl_release() and
> whether it's correct to increment the overlayfs inode's i_version.
>

Yeah, not sure what's special about doing this on close(). Seems like
someone could just hold the file open to prevent triggering the
remeasurement?

> The change is done to the inode of the copied up/modified file's inode in the
> upper layer. So the i_version should already be incremented when we call into
> the upper layer usually via vfs_*() methods.

Correct. As long as IMA is also measuring the upper inode then it seems
like you shouldn't need to do anything special here.

What sort of fs are you using for the upper layer?
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 18:46           ` Jeff Layton
@ 2023-04-06 19:11             ` Stefan Berger
  2023-04-06 19:37               ` Jeff Layton
  0 siblings, 1 reply; 61+ messages in thread
From: Stefan Berger @ 2023-04-06 19:11 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 14:46, Jeff Layton wrote:
> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:

> 
> Correct. As long as IMA is also measuring the upper inode then it seems
> like you shouldn't need to do anything special here.

Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.

>
> What sort of fs are you using for the upper layer?

jffs2:

/dev/mtdblock4 on /run/initramfs/ro type squashfs (ro,relatime,errors=continue)
/dev/mtdblock5 on /run/initramfs/rw type jffs2 (rw,relatime)
cow on / type overlay (rw,relatime,lowerdir=run/initramfs/ro,upperdir=run/initramfs/rw/cow,workdir=run/initramfs/rw/work)

Regards,
    Stefan






^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 19:11             ` Stefan Berger
@ 2023-04-06 19:37               ` Jeff Layton
  2023-04-06 20:22                 ` Stefan Berger
  0 siblings, 1 reply; 61+ messages in thread
From: Jeff Layton @ 2023-04-06 19:37 UTC (permalink / raw)
  To: Stefan Berger, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> 
> On 4/6/23 14:46, Jeff Layton wrote:
> > On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > > On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> 
> > 
> > Correct. As long as IMA is also measuring the upper inode then it seems
> > like you shouldn't need to do anything special here.
> 
> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> 


It looks like remeasurement is usually done in ima_check_last_writer.
That gets called from __fput which is called when we're releasing the
last reference to the struct file.

You've hooked into the ->release op, which gets called whenever
filp_close is called, which happens when we're disassociating the file
from the file descriptor table.

So...I don't get it. Is ima_file_free not getting called on your file
for some reason when you go to close it? It seems like that should be
handling this.

In any case, I think this could use a bit more root-cause analysis.

> > 
> > What sort of fs are you using for the upper layer?
> 
> jffs2:
> 
> /dev/mtdblock4 on /run/initramfs/ro type squashfs (ro,relatime,errors=continue)
> /dev/mtdblock5 on /run/initramfs/rw type jffs2 (rw,relatime)
> cow on / type overlay (rw,relatime,lowerdir=run/initramfs/ro,upperdir=run/initramfs/rw/cow,workdir=run/initramfs/rw/work)
> 

jffs2 does not have a proper i_version counter, I'm afraid. But, IMA
should handle that OK (by assuming that it always needs to remeasure
when there is no i_version counter).
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 19:37               ` Jeff Layton
@ 2023-04-06 20:22                 ` Stefan Berger
  2023-04-06 21:24                   ` Jeff Layton
  2023-04-07  6:42                   ` Amir Goldstein
  0 siblings, 2 replies; 61+ messages in thread
From: Stefan Berger @ 2023-04-06 20:22 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 15:37, Jeff Layton wrote:
> On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
>>
>> On 4/6/23 14:46, Jeff Layton wrote:
>>> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
>>>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
>>
>>>
>>> Correct. As long as IMA is also measuring the upper inode then it seems
>>> like you shouldn't need to do anything special here.
>>
>> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
>>
> 
> 
> It looks like remeasurement is usually done in ima_check_last_writer.
> That gets called from __fput which is called when we're releasing the
> last reference to the struct file.
> 
> You've hooked into the ->release op, which gets called whenever
> filp_close is called, which happens when we're disassociating the file
> from the file descriptor table.
> 
> So...I don't get it. Is ima_file_free not getting called on your file
> for some reason when you go to close it? It seems like that should be
> handling this.

I would ditch the original proposal in favor of this 2-line patch shown here:

https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232


The new proposed i_version increase occurs on the inode that IMA sees later on for
the file that's being executed and on which it must do a re-evaluation.

Upon file changes ima_inode_free() seems to see two ima_file_free() calls,
one for what seems to be the upper layer (used for vfs_* functions below)
and one for the lower one.
The important thing is that IMA will see the lower one when the file gets
executed later on and this is the one that I instrumented now to have its
i_version increased, which in turn triggers the re-evaluation of the file post
modification.

static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
[...]
	struct fd real;
[...]
	ret = ovl_real_fdget(file, &real);
	if (ret)
		goto out_unlock;

[...]
	if (is_sync_kiocb(iocb)) {
		file_start_write(real.file);
-->		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
				     ovl_iocb_to_rwf(ifl));
		file_end_write(real.file);
		/* Update size */
		ovl_copyattr(inode);
	} else {
		struct ovl_aio_req *aio_req;

		ret = -ENOMEM;
		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
		if (!aio_req)
			goto out;

		file_start_write(real.file);
		/* Pacify lockdep, same trick as done in aio_write() */
		__sb_writers_release(file_inode(real.file)->i_sb,
				     SB_FREEZE_WRITE);
		aio_req->fd = real;
		real.flags = 0;
		aio_req->orig_iocb = iocb;
		kiocb_clone(&aio_req->iocb, iocb, real.file);
		aio_req->iocb.ki_flags = ifl;
		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
		refcount_set(&aio_req->ref, 2);
-->		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
		ovl_aio_put(aio_req);
		if (ret != -EIOCBQUEUED)
			ovl_aio_cleanup_handler(aio_req);
	}
         if (ret > 0)						<--- this gets it to work
                 inode_maybe_inc_iversion(inode, false);		<--- since this inode is known to IMA
out:
         revert_creds(old_cred);
out_fdput:
         fdput(real);

out_unlock:
         inode_unlock(inode);




    Stefan

> 
> In any case, I think this could use a bit more root-cause analysis.


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 20:22                 ` Stefan Berger
@ 2023-04-06 21:24                   ` Jeff Layton
  2023-04-06 21:58                     ` Stefan Berger
  2023-04-06 22:04                     ` Jeff Layton
  2023-04-07  6:42                   ` Amir Goldstein
  1 sibling, 2 replies; 61+ messages in thread
From: Jeff Layton @ 2023-04-06 21:24 UTC (permalink / raw)
  To: Stefan Berger, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
> 
> On 4/6/23 15:37, Jeff Layton wrote:
> > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> > > 
> > > On 4/6/23 14:46, Jeff Layton wrote:
> > > > On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > > > > On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > > 
> > > > 
> > > > Correct. As long as IMA is also measuring the upper inode then it seems
> > > > like you shouldn't need to do anything special here.
> > > 
> > > Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> > > 
> > 
> > 
> > It looks like remeasurement is usually done in ima_check_last_writer.
> > That gets called from __fput which is called when we're releasing the
> > last reference to the struct file.
> > 
> > You've hooked into the ->release op, which gets called whenever
> > filp_close is called, which happens when we're disassociating the file
> > from the file descriptor table.
> > 
> > So...I don't get it. Is ima_file_free not getting called on your file
> > for some reason when you go to close it? It seems like that should be
> > handling this.
> 
> I would ditch the original proposal in favor of this 2-line patch shown here:
> 
> https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> 
> 

Ok, I think I get it. IMA is trying to use the i_version from the
overlayfs inode.

I suspect that the real problem here is that IMA is just doing a bare
inode_query_iversion. Really, we ought to make IMA call
vfs_getattr_nosec (or something like it) to query the getattr routine in
the upper layer. Then overlayfs could just propagate the results from
the upper layer in its response.

That sort of design may also eventually help IMA work properly with more
exotic filesystems, like NFS or Ceph.

> The new proposed i_version increase occurs on the inode that IMA sees later on for
> the file that's being executed and on which it must do a re-evaluation.
> 
> Upon file changes ima_inode_free() seems to see two ima_file_free() calls,
> one for what seems to be the upper layer (used for vfs_* functions below)
> and once for the lower one.
> The important thing is that IMA will see the lower one when the file gets
> executed later on and this is the one that I instrumented now to have its
> i_version increased, which in turn triggers the re-evaluation of the file post
> modification.
> 
> static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> [...]
> 	struct fd real;
> [...]
> 	ret = ovl_real_fdget(file, &real);
> 	if (ret)
> 		goto out_unlock;
> 
> [...]
> 	if (is_sync_kiocb(iocb)) {
> 		file_start_write(real.file);
> -->		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
> 				     ovl_iocb_to_rwf(ifl));
> 		file_end_write(real.file);
> 		/* Update size */
> 		ovl_copyattr(inode);
> 	} else {
> 		struct ovl_aio_req *aio_req;
> 
> 		ret = -ENOMEM;
> 		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
> 		if (!aio_req)
> 			goto out;
> 
> 		file_start_write(real.file);
> 		/* Pacify lockdep, same trick as done in aio_write() */
> 		__sb_writers_release(file_inode(real.file)->i_sb,
> 				     SB_FREEZE_WRITE);
> 		aio_req->fd = real;
> 		real.flags = 0;
> 		aio_req->orig_iocb = iocb;
> 		kiocb_clone(&aio_req->iocb, iocb, real.file);
> 		aio_req->iocb.ki_flags = ifl;
> 		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
> 		refcount_set(&aio_req->ref, 2);
> -->		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
> 		ovl_aio_put(aio_req);
> 		if (ret != -EIOCBQUEUED)
> 			ovl_aio_cleanup_handler(aio_req);
> 	}
>          if (ret > 0)						<--- this get it to work
>                  inode_maybe_inc_iversion(inode, false);		<--- since this inode is known to IMA
> out:
>          revert_creds(old_cred);
> out_fdput:
>          fdput(real);
> 
> out_unlock:
>          inode_unlock(inode);
> 
> 
> 
> 
>     Stefan
> 
> > 
> > In any case, I think this could use a bit more root-cause analysis.
> 

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 21:24                   ` Jeff Layton
@ 2023-04-06 21:58                     ` Stefan Berger
  2023-04-06 22:09                       ` Jeff Layton
  2023-04-06 22:04                     ` Jeff Layton
  1 sibling, 1 reply; 61+ messages in thread
From: Stefan Berger @ 2023-04-06 21:58 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 17:24, Jeff Layton wrote:
> On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
>>
>> On 4/6/23 15:37, Jeff Layton wrote:
>>> On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
>>>>
>>>> On 4/6/23 14:46, Jeff Layton wrote:
>>>>> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
>>>>>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
>>>>
>>>>>
>>>>> Correct. As long as IMA is also measuring the upper inode then it seems
>>>>> like you shouldn't need to do anything special here.
>>>>
>>>> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
>>>>
>>>
>>>
>>> It looks like remeasurement is usually done in ima_check_last_writer.
>>> That gets called from __fput which is called when we're releasing the
>>> last reference to the struct file.
>>>
>>> You've hooked into the ->release op, which gets called whenever
>>> filp_close is called, which happens when we're disassociating the file
>>> from the file descriptor table.
>>>
>>> So...I don't get it. Is ima_file_free not getting called on your file
>>> for some reason when you go to close it? It seems like that should be
>>> handling this.
>>
>> I would ditch the original proposal in favor of this 2-line patch shown here:
>>
>> https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
>>
>>
> 
> Ok, I think I get it. IMA is trying to use the i_version from the
> overlayfs inode.
> 
> I suspect that the real problem here is that IMA is just doing a bare
> inode_query_iversion. Really, we ought to make IMA call
> vfs_getattr_nosec (or something like it) to query the getattr routine in
> the upper layer. Then overlayfs could just propagate the results from
> the upper layer in its response.

You mean compare the known stat against the current one? It seems more expensive
to stat the file than to use the simple i_version-has-changed indicator.

> 
> That sort of design may also eventually help IMA work properly with more
> exotic filesystems, like NFS or Ceph.

And these don't support i_version at all?

    Stefan

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 21:24                   ` Jeff Layton
  2023-04-06 21:58                     ` Stefan Berger
@ 2023-04-06 22:04                     ` Jeff Layton
  2023-04-06 22:27                       ` Stefan Berger
                                         ` (2 more replies)
  1 sibling, 3 replies; 61+ messages in thread
From: Jeff Layton @ 2023-04-06 22:04 UTC (permalink / raw)
  To: Stefan Berger, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, 2023-04-06 at 17:24 -0400, Jeff Layton wrote:
> On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
> > 
> > On 4/6/23 15:37, Jeff Layton wrote:
> > > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> > > > 
> > > > On 4/6/23 14:46, Jeff Layton wrote:
> > > > > On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > > > > > On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > > > 
> > > > > 
> > > > > Correct. As long as IMA is also measuring the upper inode then it seems
> > > > > like you shouldn't need to do anything special here.
> > > > 
> > > > Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> > > > 
> > > 
> > > 
> > > It looks like remeasurement is usually done in ima_check_last_writer.
> > > That gets called from __fput which is called when we're releasing the
> > > last reference to the struct file.
> > > 
> > > You've hooked into the ->release op, which gets called whenever
> > > filp_close is called, which happens when we're disassociating the file
> > > from the file descriptor table.
> > > 
> > > So...I don't get it. Is ima_file_free not getting called on your file
> > > for some reason when you go to close it? It seems like that should be
> > > handling this.
> > 
> > I would ditch the original proposal in favor of this 2-line patch shown here:
> > 
> > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > 
> > 
> 
> Ok, I think I get it. IMA is trying to use the i_version from the
> overlayfs inode.
> 
> I suspect that the real problem here is that IMA is just doing a bare
> inode_query_iversion. Really, we ought to make IMA call
> vfs_getattr_nosec (or something like it) to query the getattr routine in
> the upper layer. Then overlayfs could just propagate the results from
> the upper layer in its response.
> 
> That sort of design may also eventually help IMA work properly with more
> exotic filesystems, like NFS or Ceph.
> 
> 
> 

Maybe something like this? It builds for me but I haven't tested it. It
looks like overlayfs already should report the upper layer's i_version
in getattr, though I haven't tested that either:

-----------------------8<---------------------------

[PATCH] IMA: use vfs_getattr_nosec to get the i_version

IMA currently accesses the i_version out of the inode directly when it
does a measurement. This is fine for most simple filesystems, but can be
problematic with more complex setups (e.g. overlayfs).

Make IMA instead call vfs_getattr_nosec to get this info. This allows
the filesystem to determine whether and how to report the i_version, and
should allow IMA to work properly with a broader class of filesystems in
the future.

Reported-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 security/integrity/ima/ima_api.c  |  9 ++++++---
 security/integrity/ima/ima_main.c | 12 ++++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index d3662f4acadc..c45902e72044 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -13,7 +13,6 @@
 #include <linux/fs.h>
 #include <linux/xattr.h>
 #include <linux/evm.h>
-#include <linux/iversion.h>
 #include <linux/fsverity.h>
 
 #include "ima.h"
@@ -246,10 +245,11 @@ int ima_collect_measurement(struct integrity_iint_cache *iint,
 	struct inode *inode = file_inode(file);
 	const char *filename = file->f_path.dentry->d_name.name;
 	struct ima_max_digest_data hash;
+	struct kstat stat;
 	int result = 0;
 	int length;
 	void *tmpbuf;
-	u64 i_version;
+	u64 i_version = 0;
 
 	/*
 	 * Always collect the modsig, because IMA might have already collected
@@ -268,7 +268,10 @@ int ima_collect_measurement(struct integrity_iint_cache *iint,
 	 * to an initial measurement/appraisal/audit, but was modified to
 	 * assume the file changed.
 	 */
-	i_version = inode_query_iversion(inode);
+	result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE,
+				   AT_STATX_SYNC_AS_STAT);
+	if (!result && (stat.result_mask & STATX_CHANGE_COOKIE))
+		i_version = stat.change_cookie;
 	hash.hdr.algo = algo;
 	hash.hdr.length = hash_digest_size[algo];
 
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index d66a0a36415e..365db0e43d7c 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -24,7 +24,6 @@
 #include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/ima.h>
-#include <linux/iversion.h>
 #include <linux/fs.h>
 
 #include "ima.h"
@@ -164,11 +163,16 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint,
 
 	mutex_lock(&iint->mutex);
 	if (atomic_read(&inode->i_writecount) == 1) {
+		struct kstat stat;
+
 		update = test_and_clear_bit(IMA_UPDATE_XATTR,
 					    &iint->atomic_flags);
-		if (!IS_I_VERSION(inode) ||
-		    !inode_eq_iversion(inode, iint->version) ||
-		    (iint->flags & IMA_NEW_FILE)) {
+		if ((iint->flags & IMA_NEW_FILE) ||
+		    vfs_getattr_nosec(&file->f_path, &stat,
+				      STATX_CHANGE_COOKIE,
+				      AT_STATX_SYNC_AS_STAT) ||
+		    !(stat.result_mask & STATX_CHANGE_COOKIE) ||
+		    stat.change_cookie != iint->version) {
 			iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
 			iint->measured_pcrs = 0;
 			if (update)
-- 
2.39.2



^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 21:58                     ` Stefan Berger
@ 2023-04-06 22:09                       ` Jeff Layton
  0 siblings, 0 replies; 61+ messages in thread
From: Jeff Layton @ 2023-04-06 22:09 UTC (permalink / raw)
  To: Stefan Berger, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il

On Thu, 2023-04-06 at 17:58 -0400, Stefan Berger wrote:
> 
> On 4/6/23 17:24, Jeff Layton wrote:
> > On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
> > > 
> > > On 4/6/23 15:37, Jeff Layton wrote:
> > > > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> > > > > 
> > > > > On 4/6/23 14:46, Jeff Layton wrote:
> > > > > > On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > > > > > > On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > > > > 
> > > > > > 
> > > > > > Correct. As long as IMA is also measuring the upper inode then it seems
> > > > > > like you shouldn't need to do anything special here.
> > > > > 
> > > > > Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> > > > > 
> > > > 
> > > > 
> > > > It looks like remeasurement is usually done in ima_check_last_writer.
> > > > That gets called from __fput which is called when we're releasing the
> > > > last reference to the struct file.
> > > > 
> > > > You've hooked into the ->release op, which gets called whenever
> > > > filp_close is called, which happens when we're disassociating the file
> > > > from the file descriptor table.
> > > > 
> > > > So...I don't get it. Is ima_file_free not getting called on your file
> > > > for some reason when you go to close it? It seems like that should be
> > > > handling this.
> > > 
> > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > 
> > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > 
> > > 
> > 
> > Ok, I think I get it. IMA is trying to use the i_version from the
> > overlayfs inode.
> > 
> > I suspect that the real problem here is that IMA is just doing a bare
> > inode_query_iversion. Really, we ought to make IMA call
> > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > the upper layer. Then overlayfs could just propagate the results from
> > the upper layer in its response.
> 
> You mean compare known stat against current ? It seems more expensive to stat the file
> rather than using the simple i_version-has-changed indicator.
> 

getattr is fairly cheap on a local filesystem. It's more expensive with
something networked, but that's the price of correctness.

> > That sort of design may also eventually help IMA work properly with more
> > exotic filesystems, like NFS or Ceph.
> 
> And these don't support i_version at all?

They absolutely do. Their change attributes are mediated by the server,
so they can't use the kernel's mechanism for IS_I_VERSION inodes. They
can report that field in their ->getattr routines however.

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 22:04                     ` Jeff Layton
@ 2023-04-06 22:27                       ` Stefan Berger
  2023-04-07  8:31                       ` Christian Brauner
  2023-04-17 14:07                       ` Stefan Berger
  2 siblings, 0 replies; 61+ messages in thread
From: Stefan Berger @ 2023-04-06 22:27 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 18:04, Jeff Layton wrote:
> On Thu, 2023-04-06 at 17:24 -0400, Jeff Layton wrote:
>> On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
>>>
>>> On 4/6/23 15:37, Jeff Layton wrote:
>>>> On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
>>>>>
>>>>> On 4/6/23 14:46, Jeff Layton wrote:
>>>>>> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
>>>>>>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
>>>>>
>>>>>>
>>>>>> Correct. As long as IMA is also measuring the upper inode then it seems
>>>>>> like you shouldn't need to do anything special here.
>>>>>
>>>>> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
>>>>>
>>>>
>>>>
>>>> It looks like remeasurement is usually done in ima_check_last_writer.
>>>> That gets called from __fput which is called when we're releasing the
>>>> last reference to the struct file.
>>>>
>>>> You've hooked into the ->release op, which gets called whenever
>>>> filp_close is called, which happens when we're disassociating the file
>>>> from the file descriptor table.
>>>>
>>>> So...I don't get it. Is ima_file_free not getting called on your file
>>>> for some reason when you go to close it? It seems like that should be
>>>> handling this.
>>>
>>> I would ditch the original proposal in favor of this 2-line patch shown here:
>>>
>>> https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
>>>
>>>
>>
>> Ok, I think I get it. IMA is trying to use the i_version from the
>> overlayfs inode.
>>
>> I suspect that the real problem here is that IMA is just doing a bare
>> inode_query_iversion. Really, we ought to make IMA call
>> vfs_getattr_nosec (or something like it) to query the getattr routine in
>> the upper layer. Then overlayfs could just propagate the results from
>> the upper layer in its response.
>>
>> That sort of design may also eventually help IMA work properly with more
>> exotic filesystems, like NFS or Ceph.
>>
>>
>>
> 
> Maybe something like this? It builds for me but I haven't tested it. It
> looks like overlayfs already should report the upper layer's i_version
> in getattr, though I haven't tested that either:


Thank you! I will give it a try once I am back.

     Stefan

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 20:22                 ` Stefan Berger
  2023-04-06 21:24                   ` Jeff Layton
@ 2023-04-07  6:42                   ` Amir Goldstein
  1 sibling, 0 replies; 61+ messages in thread
From: Amir Goldstein @ 2023-04-07  6:42 UTC (permalink / raw)
  To: Stefan Berger
  Cc: Jeff Layton, Christian Brauner, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Thu, Apr 6, 2023 at 11:23 PM Stefan Berger <stefanb@linux.ibm.com> wrote:
>
>
>
> On 4/6/23 15:37, Jeff Layton wrote:
> > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> >>
> >> On 4/6/23 14:46, Jeff Layton wrote:
> >>> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> >>>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> >>
> >>>
> >>> Correct. As long as IMA is also measuring the upper inode then it seems
> >>> like you shouldn't need to do anything special here.
> >>
> >> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> >>
> >
> >
> > It looks like remeasurement is usually done in ima_check_last_writer.
> > That gets called from __fput which is called when we're releasing the
> > last reference to the struct file.
> >
> > You've hooked into the ->release op, which gets called whenever
> > filp_close is called, which happens when we're disassociating the file
> > from the file descriptor table.
> >
> > So...I don't get it. Is ima_file_free not getting called on your file
> > for some reason when you go to close it? It seems like that should be
> > handling this.
>
> I would ditch the original proposal in favor of this 2-line patch shown here:
>
> https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
>
>
> The new proposed i_version increase occurs on the inode that IMA sees later on for
> the file that's being executed and on which it must do a re-evaluation.
>
> Upon file changes ima_inode_free() seems to see two ima_file_free() calls,
> one for what seems to be the upper layer (used for vfs_* functions below)
> and once for the lower one.
> The important thing is that IMA will see the lower one when the file gets
> executed later on and this is the one that I instrumented now to have its
> i_version increased, which in turn triggers the re-evaluation of the file post
> modification.
>
> static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> [...]
>         struct fd real;
> [...]
>         ret = ovl_real_fdget(file, &real);
>         if (ret)
>                 goto out_unlock;
>
> [...]
>         if (is_sync_kiocb(iocb)) {
>                 file_start_write(real.file);
> -->             ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
>                                      ovl_iocb_to_rwf(ifl));
>                 file_end_write(real.file);
>                 /* Update size */
>                 ovl_copyattr(inode);
>         } else {
>                 struct ovl_aio_req *aio_req;
>
>                 ret = -ENOMEM;
>                 aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
>                 if (!aio_req)
>                         goto out;
>
>                 file_start_write(real.file);
>                 /* Pacify lockdep, same trick as done in aio_write() */
>                 __sb_writers_release(file_inode(real.file)->i_sb,
>                                      SB_FREEZE_WRITE);
>                 aio_req->fd = real;
>                 real.flags = 0;
>                 aio_req->orig_iocb = iocb;
>                 kiocb_clone(&aio_req->iocb, iocb, real.file);
>                 aio_req->iocb.ki_flags = ifl;
>                 aio_req->iocb.ki_complete = ovl_aio_rw_complete;
>                 refcount_set(&aio_req->ref, 2);
> -->             ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
>                 ovl_aio_put(aio_req);
>                 if (ret != -EIOCBQUEUED)
>                         ovl_aio_cleanup_handler(aio_req);
>         }
>          if (ret > 0)                                           <--- this get it to work
>                  inode_maybe_inc_iversion(inode, false);                <--- since this inode is known to IMA

If the aio is queued, then I think increasing i_version here may be premature.

Note that in this code flow, the ovl ctime is updated in
ovl_aio_cleanup_handler() => ovl_copyattr()
after file_end_write(), similar to the is_sync_kiocb() code path.

It probably makes most sense to include i_version in ovl_copyattr().
Note that this could cause ovl i_version to go backwards on copy up
(i.e. after first open for write) when switching from the lower inode
i_version to the upper inode i_version.

Jeff's proposal to use vfs_getattr_nosec() in IMA code is fine too.
It will result in the same i_version jump.

IMO it wouldn't hurt to have a valid i_version value in the ovl inode
as well. If the ovl inode values did not matter, we would not have
needed ovl_copyattr() at all, but it's not good to keep vfs in the dark...

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 22:04                     ` Jeff Layton
  2023-04-06 22:27                       ` Stefan Berger
@ 2023-04-07  8:31                       ` Christian Brauner
  2023-04-07 13:29                         ` Jeff Layton
  2023-05-19 19:42                         ` Mimi Zohar
  2023-04-17 14:07                       ` Stefan Berger
  2 siblings, 2 replies; 61+ messages in thread
From: Christian Brauner @ 2023-04-07  8:31 UTC (permalink / raw)
  To: Amir Goldstein, Jeff Layton
  Cc: Stefan Berger, Paul Moore, zohar, linux-integrity, miklos,
	linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs

On Fri, Apr 07, 2023 at 09:42:43AM +0300, Amir Goldstein wrote:
> On Thu, Apr 6, 2023 at 11:23 PM Stefan Berger <stefanb@linux.ibm.com> wrote:
> >
> >
> >
> > On 4/6/23 15:37, Jeff Layton wrote:
> > > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> > >>
> > >> On 4/6/23 14:46, Jeff Layton wrote:
> > >>> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > >>>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > >>
> > >>>
> > >>> Correct. As long as IMA is also measuring the upper inode then it seems
> > >>> like you shouldn't need to do anything special here.
> > >>
> > >> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> > >>
> > >
> > >
> > > It looks like remeasurement is usually done in ima_check_last_writer.
> > > That gets called from __fput which is called when we're releasing the
> > > last reference to the struct file.
> > >
> > > You've hooked into the ->release op, which gets called whenever
> > > filp_close is called, which happens when we're disassociating the file
> > > from the file descriptor table.
> > >
> > > So...I don't get it. Is ima_file_free not getting called on your file
> > > for some reason when you go to close it? It seems like that should be
> > > handling this.
> >
> > I would ditch the original proposal in favor of this 2-line patch shown here:
> >
> > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> >
> >
> > The new proposed i_version increase occurs on the inode that IMA sees later on for
> > the file that's being executed and on which it must do a re-evaluation.
> >
> > Upon file changes ima_inode_free() seems to see two ima_file_free() calls,
> > one for what seems to be the upper layer (used for vfs_* functions below)
> > and once for the lower one.
> > The important thing is that IMA will see the lower one when the file gets
> > executed later on and this is the one that I instrumented now to have its
> > i_version increased, which in turn triggers the re-evaluation of the file post
> > modification.
> >
> > static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> > [...]
> >         struct fd real;
> > [...]
> >         ret = ovl_real_fdget(file, &real);
> >         if (ret)
> >                 goto out_unlock;
> >
> > [...]
> >         if (is_sync_kiocb(iocb)) {
> >                 file_start_write(real.file);
> > -->             ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
> >                                      ovl_iocb_to_rwf(ifl));
> >                 file_end_write(real.file);
> >                 /* Update size */
> >                 ovl_copyattr(inode);
> >         } else {
> >                 struct ovl_aio_req *aio_req;
> >
> >                 ret = -ENOMEM;
> >                 aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
> >                 if (!aio_req)
> >                         goto out;
> >
> >                 file_start_write(real.file);
> >                 /* Pacify lockdep, same trick as done in aio_write() */
> >                 __sb_writers_release(file_inode(real.file)->i_sb,
> >                                      SB_FREEZE_WRITE);
> >                 aio_req->fd = real;
> >                 real.flags = 0;
> >                 aio_req->orig_iocb = iocb;
> >                 kiocb_clone(&aio_req->iocb, iocb, real.file);
> >                 aio_req->iocb.ki_flags = ifl;
> >                 aio_req->iocb.ki_complete = ovl_aio_rw_complete;
> >                 refcount_set(&aio_req->ref, 2);
> > -->             ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
> >                 ovl_aio_put(aio_req);
> >                 if (ret != -EIOCBQUEUED)
> >                         ovl_aio_cleanup_handler(aio_req);
> >         }
> >          if (ret > 0)                                           <--- this get it to work
> >                  inode_maybe_inc_iversion(inode, false);                <--- since this inode is known to IMA
> 
> If the aio is queued, then I think increasing i_version here may be premature.
> 
> Note that in this code flow, the ovl ctime is updated in
> ovl_aio_cleanup_handler() => ovl_copyattr()
> after file_end_write(), similar to the is_sync_kiocb() code patch.
> 
> It probably makes most sense to include i_version in ovl_copyattr().
> Note that this could cause ovl i_version to go backwards on copy up
> (i.e. after first open for write) when switching from the lower inode
> i_version to the upper inode i_version.
> 
> Jeff's proposal to use vfs_getattr_nosec() in IMA code is fine too.
> It will result in the same i_version jump.
> 
> IMO it wouldn't hurt to have a valid i_version value in the ovl inode
> as well. If the ovl inode values would not matter, we would not have
> needed  ovl_copyattr() at all, but it's not good to keep vfs in the dark...
> 
> Thanks,
> Amir.

On Thu, Apr 06, 2023 at 05:24:11PM -0400, Jeff Layton wrote:
> On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
> > 
> > On 4/6/23 15:37, Jeff Layton wrote:
> > > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> > > > 
> > > > On 4/6/23 14:46, Jeff Layton wrote:
> > > > > On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > > > > > On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > > > 
> > > > > 
> > > > > Correct. As long as IMA is also measuring the upper inode then it seems
> > > > > like you shouldn't need to do anything special here.
> > > > 
> > > > Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> > > > 
> > > 
> > > 
> > > It looks like remeasurement is usually done in ima_check_last_writer.
> > > That gets called from __fput which is called when we're releasing the
> > > last reference to the struct file.
> > > 
> > > You've hooked into the ->release op, which gets called whenever
> > > filp_close is called, which happens when we're disassociating the file
> > > from the file descriptor table.
> > > 
> > > So...I don't get it. Is ima_file_free not getting called on your file
> > > for some reason when you go to close it? It seems like that should be
> > > handling this.
> > 
> > I would ditch the original proposal in favor of this 2-line patch shown here:
> > 
> > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > 
> > 
> 
> Ok, I think I get it. IMA is trying to use the i_version from the
> overlayfs inode.

Which is likely to give wrong results and I agree with you that it
should rely on vfs_getattr_nosec().

> 
> I suspect that the real problem here is that IMA is just doing a bare
> inode_query_iversion. Really, we ought to make IMA call
> vfs_getattr_nosec (or something like it) to query the getattr routine in
> the upper layer. Then overlayfs could just propagate the results from
> the upper layer in its response.
> 
> That sort of design may also eventually help IMA work properly with more
> exotic filesystems, like NFS or Ceph.
> 
> > The new proposed i_version increase occurs on the inode that IMA sees later on for
> > the file that's being executed and on which it must do a re-evaluation.
> > 
> > Upon file changes ima_inode_free() seems to see two ima_file_free() calls,
> > one for what seems to be the upper layer (used for vfs_* functions below)
> > and once for the lower one.
> > The important thing is that IMA will see the lower one when the file gets
> > executed later on and this is the one that I instrumented now to have its
> > i_version increased, which in turn triggers the re-evaluation of the file post
> > modification.
> > 
> > static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> > [...]
> > 	struct fd real;
> > [...]
> > 	ret = ovl_real_fdget(file, &real);
> > 	if (ret)
> > 		goto out_unlock;
> > 
> > [...]
> > 	if (is_sync_kiocb(iocb)) {
> > 		file_start_write(real.file);
> > -->		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
> > 				     ovl_iocb_to_rwf(ifl));
> > 		file_end_write(real.file);
> > 		/* Update size */
> > 		ovl_copyattr(inode);
> > 	} else {
> > 		struct ovl_aio_req *aio_req;
> > 
> > 		ret = -ENOMEM;
> > 		aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
> > 		if (!aio_req)
> > 			goto out;
> > 
> > 		file_start_write(real.file);
> > 		/* Pacify lockdep, same trick as done in aio_write() */
> > 		__sb_writers_release(file_inode(real.file)->i_sb,
> > 				     SB_FREEZE_WRITE);
> > 		aio_req->fd = real;
> > 		real.flags = 0;
> > 		aio_req->orig_iocb = iocb;
> > 		kiocb_clone(&aio_req->iocb, iocb, real.file);
> > 		aio_req->iocb.ki_flags = ifl;
> > 		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
> > 		refcount_set(&aio_req->ref, 2);
> > -->		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
> > 		ovl_aio_put(aio_req);
> > 		if (ret != -EIOCBQUEUED)
> > 			ovl_aio_cleanup_handler(aio_req);
> > 	}
> >          if (ret > 0)						<--- this get it to work
> >                  inode_maybe_inc_iversion(inode, false);		<--- since this inode is known to IMA
> > out:
> >          revert_creds(old_cred);
> > out_fdput:
> >          fdput(real);
> > 
> > out_unlock:
> >          inode_unlock(inode);
> > 
> > 
> > 
> > 
> >     Stefan
> > 
> > > 
> > > In any case, I think this could use a bit more root-cause analysis.
> > 
> 
> -- 
> Jeff Layton <jlayton@kernel.org>

On Thu, Apr 06, 2023 at 06:04:36PM -0400, Jeff Layton wrote:
> On Thu, 2023-04-06 at 17:24 -0400, Jeff Layton wrote:
> > On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
> > > 
> > > On 4/6/23 15:37, Jeff Layton wrote:
> > > > On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
> > > > > 
> > > > > On 4/6/23 14:46, Jeff Layton wrote:
> > > > > > On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
> > > > > > > On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
> > > > > 
> > > > > > 
> > > > > > Correct. As long as IMA is also measuring the upper inode then it seems
> > > > > > like you shouldn't need to do anything special here.
> > > > > 
> > > > > Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
> > > > > 
> > > > 
> > > > 
> > > > It looks like remeasurement is usually done in ima_check_last_writer.
> > > > That gets called from __fput which is called when we're releasing the
> > > > last reference to the struct file.
> > > > 
> > > > You've hooked into the ->release op, which gets called whenever
> > > > filp_close is called, which happens when we're disassociating the file
> > > > from the file descriptor table.
> > > > 
> > > > So...I don't get it. Is ima_file_free not getting called on your file
> > > > for some reason when you go to close it? It seems like that should be
> > > > handling this.
> > > 
> > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > 
> > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232

We should cool it with the quick hacks to fix things. :)

> > > 
> > > 
> > 
> > Ok, I think I get it. IMA is trying to use the i_version from the
> > overlayfs inode.
> > 
> > I suspect that the real problem here is that IMA is just doing a bare
> > inode_query_iversion. Really, we ought to make IMA call
> > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > the upper layer. Then overlayfs could just propagate the results from
> > the upper layer in its response.
> > 
> > That sort of design may also eventually help IMA work properly with more
> > exotic filesystems, like NFS or Ceph.
> > 
> > 
> > 
> 
> Maybe something like this? It builds for me but I haven't tested it. It
> looks like overlayfs already should report the upper layer's i_version
> in getattr, though I haven't tested that either:
> 
> -----------------------8<---------------------------
> 
> [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> 
> IMA currently accesses the i_version out of the inode directly when it
> does a measurement. This is fine for most simple filesystems, but can be
> problematic with more complex setups (e.g. overlayfs).
> 
> Make IMA instead call vfs_getattr_nosec to get this info. This allows
> the filesystem to determine whether and how to report the i_version, and
> should allow IMA to work properly with a broader class of filesystems in
> the future.
> 
> Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---

So, I think we want both; we want the ovl_copyattr() and the
vfs_getattr_nosec() change:

(1) overlayfs should copy up the inode version in ovl_copyattr(). That
    is in line with what we do with all other inode attributes. IOW, the
    overlayfs inode's i_version counter should aim to mirror the
    relevant layer's i_version counter. I wouldn't know why that
    shouldn't be the case. Asking the other way around there doesn't
    seem to be any use for overlayfs inodes to have an i_version that
    isn't just mirroring the relevant layer's i_version.
(2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
    Currently, ima assumes that it will get the correct i_version from
    an inode but that just doesn't hold for stacking filesystems.

While (1) would likely just fix the immediate bug, (2) is correct and
_robust_. If we change how attributes are handled vfs_*() helpers will
get updated and ima with it. Poking at raw inodes without using
appropriate helpers is much more likely to get ima into trouble.

Christian

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-07  8:31                       ` Christian Brauner
@ 2023-04-07 13:29                         ` Jeff Layton
  2023-04-09 15:22                           ` Christian Brauner
                                             ` (2 more replies)
  2023-05-19 19:42                         ` Mimi Zohar
  1 sibling, 3 replies; 61+ messages in thread
From: Jeff Layton @ 2023-04-07 13:29 UTC (permalink / raw)
  To: Christian Brauner, Amir Goldstein
  Cc: Stefan Berger, Paul Moore, zohar, linux-integrity, miklos,
	linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs

> > > > 
> > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > 
> > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> 
> We should cool it with the quick hacks to fix things. :)
> 

Yeah. It might fix this specific testcase, but I think the way it uses
the i_version is "gameable" in other situations. Then again, I don't
know a lot about IMA in this regard.

When is it expected to remeasure? If it's only expected to remeasure on
a close(), then that's one thing. That would be a weird design though.

> > > > 
> > > > 
> > > 
> > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > overlayfs inode.
> > > 
> > > I suspect that the real problem here is that IMA is just doing a bare
> > > inode_query_iversion. Really, we ought to make IMA call
> > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > the upper layer. Then overlayfs could just propagate the results from
> > > the upper layer in its response.
> > > 
> > > That sort of design may also eventually help IMA work properly with more
> > > exotic filesystems, like NFS or Ceph.
> > > 
> > > 
> > > 
> > 
> > Maybe something like this? It builds for me but I haven't tested it. It
> > looks like overlayfs already should report the upper layer's i_version
> > in getattr, though I haven't tested that either:
> > 
> > -----------------------8<---------------------------
> > 
> > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > 
> > IMA currently accesses the i_version out of the inode directly when it
> > does a measurement. This is fine for most simple filesystems, but can be
> > problematic with more complex setups (e.g. overlayfs).
> > 
> > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > the filesystem to determine whether and how to report the i_version, and
> > should allow IMA to work properly with a broader class of filesystems in
> > the future.
> > 
> > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> 
> So, I think we want both; we want the ovl_copyattr() and the
> vfs_getattr_nosec() change:
> 
> (1) overlayfs should copy up the inode version in ovl_copyattr(). That
>     is in line what we do with all other inode attributes. IOW, the
>     overlayfs inode's i_version counter should aim to mirror the
>     relevant layer's i_version counter. I wouldn't know why that
>     shouldn't be the case. Asking the other way around there doesn't
>     seem to be any use for overlayfs inodes to have an i_version that
>     isn't just mirroring the relevant layer's i_version.

It's less than ideal to do this IMO, particularly with an IS_I_VERSION
inode.

You can't just copy up the value from the upper. You'll need to call
inode_query_iversion(upper_inode), which will flag the upper inode for a
logged i_version update on the next write. IOW, this could create some
(probably minor) metadata write amplification in the upper layer inode
with IS_I_VERSION inodes.


> (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
>     Currently, ima assumes that it will get the correct i_version from
>     an inode but that just doesn't hold for stacking filesystem.
> 
> While (1) would likely just fix the immediate bug (2) is correct and
> _robust_. If we change how attributes are handled vfs_*() helpers will
> get updated and ima with it. Poking at raw inodes without using
> appropriate helpers is much more likely to get ima into trouble.

This will fix it the right way, I think (assuming it actually works),
and should open the door for IMA to work properly with networked
filesystems that support i_version as well.

Note that Stephen is correct that calling getattr is probably
going to be less efficient here since we're going to end up calling
generic_fillattr unnecessarily, but I still think it's the right thing
to do.

If it turns out to cause measurable performance regressions though,
maybe we can look at adding something that still calls ->getattr if it
exists but only returns the change_cookie value.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-07 13:29                         ` Jeff Layton
@ 2023-04-09 15:22                           ` Christian Brauner
  2023-04-09 22:12                             ` Jeff Layton
  2023-04-17  1:57                           ` Stefan Berger
  2023-04-21 14:43                           ` Mimi Zohar
  2 siblings, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-04-09 15:22 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Fri, Apr 07, 2023 at 09:29:29AM -0400, Jeff Layton wrote:
> > > > > 
> > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > 
> > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > 
> > We should cool it with the quick hacks to fix things. :)
> > 
> 
> Yeah. It might fix this specific testcase, but I think the way it uses
> the i_version is "gameable" in other situations. Then again, I don't
> know a lot about IMA in this regard.
> 
> When is it expected to remeasure? If it's only expected to remeasure on
> a close(), then that's one thing. That would be a weird design though.
> 
> > > > > 
> > > > > 
> > > > 
> > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > overlayfs inode.
> > > > 
> > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > inode_query_iversion. Really, we ought to make IMA call
> > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > the upper layer. Then overlayfs could just propagate the results from
> > > > the upper layer in its response.
> > > > 
> > > > That sort of design may also eventually help IMA work properly with more
> > > > exotic filesystems, like NFS or Ceph.
> > > > 
> > > > 
> > > > 
> > > 
> > > Maybe something like this? It builds for me but I haven't tested it. It
> > > looks like overlayfs already should report the upper layer's i_version
> > > in getattr, though I haven't tested that either:
> > > 
> > > -----------------------8<---------------------------
> > > 
> > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > 
> > > IMA currently accesses the i_version out of the inode directly when it
> > > does a measurement. This is fine for most simple filesystems, but can be
> > > problematic with more complex setups (e.g. overlayfs).
> > > 
> > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > the filesystem to determine whether and how to report the i_version, and
> > > should allow IMA to work properly with a broader class of filesystems in
> > > the future.
> > > 
> > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > 
> > So, I think we want both; we want the ovl_copyattr() and the
> > vfs_getattr_nosec() change:
> > 
> > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> >     is in line what we do with all other inode attributes. IOW, the
> >     overlayfs inode's i_version counter should aim to mirror the
> >     relevant layer's i_version counter. I wouldn't know why that
> >     shouldn't be the case. Asking the other way around there doesn't
> >     seem to be any use for overlayfs inodes to have an i_version that
> >     isn't just mirroring the relevant layer's i_version.
> 
> It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> inode.
> 
> You can't just copy up the value from the upper. You'll need to call
> inode_query_iversion(upper_inode), which will flag the upper inode for a
> logged i_version update on the next write. IOW, this could create some
> (probably minor) metadata write amplification in the upper layer inode
> with IS_I_VERSION inodes.

I'm likely just missing context and am curious about this so bear with me. Why
do we need to flag the upper inode for a logged i_version update? Any required
i_version interactions should've already happened when overlayfs called into
the upper layer. So all that's left to do is for overlayfs to mirror the
i_version value after the upper operation has returned.

ovl_copyattr() - which copies the inode attributes - is always called after the
operation on the upper inode has finished. So the additional query seems odd at
first glance. But there might well be a good reason for it. In my naive
approach I would've thought that something along the lines of:

diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 923d66d131c1..8b089035b9b3 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1119,4 +1119,5 @@ void ovl_copyattr(struct inode *inode)
        inode->i_mtime = realinode->i_mtime;
        inode->i_ctime = realinode->i_ctime;
        i_size_write(inode, i_size_read(realinode));
+       inode_set_iversion_raw(inode, inode_peek_iversion_raw(realinode));
 }

would've been sufficient.

Since overlayfs does explicitly disallow changes to the upper and lower trees
while overlayfs is mounted it seems intuitive that it should just mirror the
relevant layer's i_version.

If we don't do this, then we should probably document that i_version doesn't
have a meaning yet for the inodes of stacking filesystems.

> 
> 
> > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> >     Currently, ima assumes that it will get the correct i_version from
> >     an inode but that just doesn't hold for stacking filesystem.
> > 
> > While (1) would likely just fix the immediate bug (2) is correct and
> > _robust_. If we change how attributes are handled vfs_*() helpers will
> > get updated and ima with it. Poking at raw inodes without using
> > appropriate helpers is much more likely to get ima into trouble.
> 
> This will fix it the right way, I think (assuming it actually works),
> and should open the door for IMA to work properly with networked
> filesystems that support i_version as well.
> 
> Note that there Stephen is correct that calling getattr is probably
> going to be less efficient here since we're going to end up calling
> generic_fillattr unnecessarily, but I still think it's the right thing
> to do.
> 
> If it turns out to cause measurable performance regressions though,
> maybe we can look at adding a something that still calls ->getattr if it
> exists but only returns the change_cookie value.

Sounds good to me.

^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-09 15:22                           ` Christian Brauner
@ 2023-04-09 22:12                             ` Jeff Layton
  2023-04-11  8:38                               ` Christian Brauner
  0 siblings, 1 reply; 61+ messages in thread
From: Jeff Layton @ 2023-04-09 22:12 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Sun, 2023-04-09 at 17:22 +0200, Christian Brauner wrote:
> On Fri, Apr 07, 2023 at 09:29:29AM -0400, Jeff Layton wrote:
> > > > > > 
> > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > 
> > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > 
> > > We should cool it with the quick hacks to fix things. :)
> > > 
> > 
> > Yeah. It might fix this specific testcase, but I think the way it uses
> > the i_version is "gameable" in other situations. Then again, I don't
> > know a lot about IMA in this regard.
> > 
> > When is it expected to remeasure? If it's only expected to remeasure on
> > a close(), then that's one thing. That would be a weird design though.
> > 
> > > > > > 
> > > > > > 
> > > > > 
> > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > overlayfs inode.
> > > > > 
> > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > the upper layer in its response.
> > > > > 
> > > > > That sort of design may also eventually help IMA work properly with more
> > > > > exotic filesystems, like NFS or Ceph.
> > > > > 
> > > > > 
> > > > > 
> > > > 
> > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > looks like overlayfs already should report the upper layer's i_version
> > > > in getattr, though I haven't tested that either:
> > > > 
> > > > -----------------------8<---------------------------
> > > > 
> > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > 
> > > > IMA currently accesses the i_version out of the inode directly when it
> > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > problematic with more complex setups (e.g. overlayfs).
> > > > 
> > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > the filesystem to determine whether and how to report the i_version, and
> > > > should allow IMA to work properly with a broader class of filesystems in
> > > > the future.
> > > > 
> > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > 
> > > So, I think we want both; we want the ovl_copyattr() and the
> > > vfs_getattr_nosec() change:
> > > 
> > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > >     is in line what we do with all other inode attributes. IOW, the
> > >     overlayfs inode's i_version counter should aim to mirror the
> > >     relevant layer's i_version counter. I wouldn't know why that
> > >     shouldn't be the case. Asking the other way around there doesn't
> > >     seem to be any use for overlayfs inodes to have an i_version that
> > >     isn't just mirroring the relevant layer's i_version.
> > 
> > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > inode.
> > 
> > You can't just copy up the value from the upper. You'll need to call
> > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > logged i_version update on the next write. IOW, this could create some
> > (probably minor) metadata write amplification in the upper layer inode
> > with IS_I_VERSION inodes.
> 
> I'm likely just missing context and am curious about this so bear with me. Why
> do we need to flag the upper inode for a logged i_version update? Any required
> i_version interactions should've already happened when overlayfs called into
> the upper layer. So all that's left to do is for overlayfs' to mirror the
> i_version value after the upper operation has returned.

> ovl_copyattr() - which copies the inode attributes - is always called after the
> operation on the upper inode has finished. So the additional query seems odd at
> first glance. But there might well be a good reason for it. In my naive
> approach I would've thought that sm along the lines of:
>
> diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> index 923d66d131c1..8b089035b9b3 100644
> --- a/fs/overlayfs/util.c
> +++ b/fs/overlayfs/util.c
> @@ -1119,4 +1119,5 @@ void ovl_copyattr(struct inode *inode)
>         inode->i_mtime = realinode->i_mtime;
>         inode->i_ctime = realinode->i_ctime;
>         i_size_write(inode, i_size_read(realinode));
> +       inode_set_iversion_raw(inode, inode_peek_iversion_raw(realinode));
>  }
> 
> would've been sufficient.
> 

Nope, because then you wouldn't get any updates to i_version after that
point.

Note that with an IS_I_VERSION inode we only update the i_version when
there has been a query since the last update. What you're doing above is
circumventing that mechanism. You'll get the i_version at the time of
the ovl_copyattr, but there won't be any updates of it after that point
because the QUERIED bit won't end up being set on realinode.


> Since overlayfs' does explicitly disallow changes to the upper and lower trees
> while overlayfs is mounted it seems intuitive that it should just mirror the
> relevant layer's i_version.
>
>
> If we don't do this, then we should probably document that i_version doesn't
> have a meaning yet for the inodes of stacking filesystems.
> 

Trying to cache the i_version is counterproductive, IMO, at least with
an IS_I_VERSION inode.

The problem is that a query against the i_version has a side-effect. It
has to (atomically) mark the inode for an update on the next change.

If you try to cache that value, you'll likely end up doing more queries
than you really need to (because you'll need to keep the cache up to
date) and you'll have an i_version that will necessarily lag the one in
the upper layer inode.

The whole point of the change attribute is to get the value as it is at
this very moment so we can check whether there have been changes. A
laggy value is not terribly useful.

Overlayfs should just always call the upper layer's ->getattr to get the
version. I wouldn't even bother copying it up in the first place. Doing
so is just encouraging someone to try to use the value in the overlayfs
inode, when they really need to go through ->getattr and get the one
from the upper layer.
> 
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-09 22:12                             ` Jeff Layton
@ 2023-04-11  8:38                               ` Christian Brauner
  2023-04-11  9:32                                 ` Jeff Layton
  2023-04-21 14:55                                 ` Mimi Zohar
  0 siblings, 2 replies; 61+ messages in thread
From: Christian Brauner @ 2023-04-11  8:38 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Sun, Apr 09, 2023 at 06:12:09PM -0400, Jeff Layton wrote:
> On Sun, 2023-04-09 at 17:22 +0200, Christian Brauner wrote:
> > On Fri, Apr 07, 2023 at 09:29:29AM -0400, Jeff Layton wrote:
> > > > > > > 
> > > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > > 
> > > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > > 
> > > > We should cool it with the quick hacks to fix things. :)
> > > > 
> > > 
> > > Yeah. It might fix this specific testcase, but I think the way it uses
> > > the i_version is "gameable" in other situations. Then again, I don't
> > > know a lot about IMA in this regard.
> > > 
> > > When is it expected to remeasure? If it's only expected to remeasure on
> > > a close(), then that's one thing. That would be a weird design though.
> > > 
> > > > > > > 
> > > > > > > 
> > > > > > 
> > > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > > overlayfs inode.
> > > > > > 
> > > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > > the upper layer in its response.
> > > > > > 
> > > > > > That sort of design may also eventually help IMA work properly with more
> > > > > > exotic filesystems, like NFS or Ceph.
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > 
> > > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > > looks like overlayfs already should report the upper layer's i_version
> > > > > in getattr, though I haven't tested that either:
> > > > > 
> > > > > -----------------------8<---------------------------
> > > > > 
> > > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > > 
> > > > > IMA currently accesses the i_version out of the inode directly when it
> > > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > > problematic with more complex setups (e.g. overlayfs).
> > > > > 
> > > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > > the filesystem to determine whether and how to report the i_version, and
> > > > > should allow IMA to work properly with a broader class of filesystems in
> > > > > the future.
> > > > > 
> > > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > 
> > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > vfs_getattr_nosec() change:
> > > > 
> > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > >     is in line what we do with all other inode attributes. IOW, the
> > > >     overlayfs inode's i_version counter should aim to mirror the
> > > >     relevant layer's i_version counter. I wouldn't know why that
> > > >     shouldn't be the case. Asking the other way around there doesn't
> > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > >     isn't just mirroring the relevant layer's i_version.
> > > 
> > > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > > inode.
> > > 
> > > You can't just copy up the value from the upper. You'll need to call
> > > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > > logged i_version update on the next write. IOW, this could create some
> > > (probably minor) metadata write amplification in the upper layer inode
> > > with IS_I_VERSION inodes.
> > 
> > I'm likely just missing context and am curious about this so bear with me. Why
> > do we need to flag the upper inode for a logged i_version update? Any required
> > i_version interactions should've already happened when overlayfs called into
> > the upper layer. So all that's left to do is for overlayfs' to mirror the
> > i_version value after the upper operation has returned.
> 
> > ovl_copyattr() - which copies the inode attributes - is always called after the
> > operation on the upper inode has finished. So the additional query seems odd at
> > first glance. But there might well be a good reason for it. In my naive
> > approach I would've thought that sm along the lines of:
> >
> > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > index 923d66d131c1..8b089035b9b3 100644
> > --- a/fs/overlayfs/util.c
> > +++ b/fs/overlayfs/util.c
> > @@ -1119,4 +1119,5 @@ void ovl_copyattr(struct inode *inode)
> >         inode->i_mtime = realinode->i_mtime;
> >         inode->i_ctime = realinode->i_ctime;
> >         i_size_write(inode, i_size_read(realinode));
> > +       inode_set_iversion_raw(inode, inode_peek_iversion_raw(realinode));
> >  }
> > 
> > would've been sufficient.
> > 
> 
> Nope, because then you wouldn't get any updates to i_version after that
> point.
> 
> Note that with an IS_I_VERSION inode we only update the i_version when
> there has been a query since the last update. What you're doing above is
> circumventing that mechanism. You'll get the i_version at the time of of
> the ovl_copyattr, but there won't be any updates of it after that point
> because the QUERIED bit won't end up being set on realinode.

I get all that.
But my understanding had been that the i_version value at the time of
ovl_copyattr() would be correct. Because when ovl_copyattr() is called
the expected i_version change will have been done in the relevant layer
including raising the QUERIED bit. Since the layers are not allowed to be
changed outside of the overlayfs mount any change to them can only
originate from overlayfs which would necessarily call ovl_copyattr()
again. IOW, overlayfs would by virtue of its implementation keep the
i_version value in sync.

Overlayfs wouldn't even raise SB_I_VERSION. It would indeed just be a
cache of i_version of the relevant layer.

> 
> 
> > Since overlayfs' does explicitly disallow changes to the upper and lower trees
> > while overlayfs is mounted it seems intuitive that it should just mirror the
> > relevant layer's i_version.
> >
> >
> > If we don't do this, then we should probably document that i_version doesn't
> > have a meaning yet for the inodes of stacking filesystems.
> > 
> 
> Trying to cache the i_version is counterproductive, IMO, at least with
> an IS_I_VERSION inode.
> 
> The problem is that a query against the i_version has a side-effect. It
> has to (atomically) mark the inode for an update on the next change.
> 
> If you try to cache that value, you'll likely end up doing more queries
> than you really need to (because you'll need to keep the cache up to
> date) and you'll have an i_version that will necessarily lag the one in
> the upper layer inode.
> 
> The whole point of the change attribute is to get the value as it is at
> this very moment so we can check whether there have been changes. A
> laggy value is not terribly useful.
> 
> Overlayfs should just always call the upper layer's ->getattr to get the
> version. I wouldn't even bother copying it up in the first place. Doing
> so is just encouraging someone to try use the value in the overlayfs
> inode, when they really need to go through ->getattr and get the one
> from the upper layer.

That seems reasonable to me. I read this as agreeing with my earlier
suggestion to document that i_version doesn't have a meaning for the
inodes of stacking filesystems and that we should spell out that
vfs_getattr()/->getattr() needs to be used to interact with i_version.

We need to explain to subsystems such as IMA somewhere what the correct
way to query i_version agnostically is; independent of filesystem
implementation details.

Looking at IMA, it queries the i_version directly without checking
whether it's an IS_I_VERSION() inode first. This might make a
difference.

Afaict, filesystems that persist i_version to disk automatically raise
SB_I_VERSION. I would guess that it would be considered a bug if a filesystem
would persist i_version to disk and not raise SB_I_VERSION. If so IMA
should probably be made to check for IS_I_VERSION() and it will probably
get that by switching to vfs_getattr_nosec().

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-11  8:38                               ` Christian Brauner
@ 2023-04-11  9:32                                 ` Jeff Layton
  2023-04-11  9:49                                   ` Christian Brauner
  2023-04-21 14:55                                 ` Mimi Zohar
  1 sibling, 1 reply; 61+ messages in thread
From: Jeff Layton @ 2023-04-11  9:32 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Tue, 2023-04-11 at 10:38 +0200, Christian Brauner wrote:
> On Sun, Apr 09, 2023 at 06:12:09PM -0400, Jeff Layton wrote:
> > On Sun, 2023-04-09 at 17:22 +0200, Christian Brauner wrote:
> > > On Fri, Apr 07, 2023 at 09:29:29AM -0400, Jeff Layton wrote:
> > > > > > > > 
> > > > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > > > 
> > > > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > > > 
> > > > > We should cool it with the quick hacks to fix things. :)
> > > > > 
> > > > 
> > > > Yeah. It might fix this specific testcase, but I think the way it uses
> > > > the i_version is "gameable" in other situations. Then again, I don't
> > > > know a lot about IMA in this regard.
> > > > 
> > > > When is it expected to remeasure? If it's only expected to remeasure on
> > > > a close(), then that's one thing. That would be a weird design though.
> > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > > > overlayfs inode.
> > > > > > > 
> > > > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > > > the upper layer in its response.
> > > > > > > 
> > > > > > > That sort of design may also eventually help IMA work properly with more
> > > > > > > exotic filesystems, like NFS or Ceph.
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > 
> > > > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > > > looks like overlayfs already should report the upper layer's i_version
> > > > > > in getattr, though I haven't tested that either:
> > > > > > 
> > > > > > -----------------------8<---------------------------
> > > > > > 
> > > > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > > > 
> > > > > > IMA currently accesses the i_version out of the inode directly when it
> > > > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > > > problematic with more complex setups (e.g. overlayfs).
> > > > > > 
> > > > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > > > the filesystem to determine whether and how to report the i_version, and
> > > > > > should allow IMA to work properly with a broader class of filesystems in
> > > > > > the future.
> > > > > > 
> > > > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > ---
> > > > > 
> > > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > > vfs_getattr_nosec() change:
> > > > > 
> > > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > > >     is in line what we do with all other inode attributes. IOW, the
> > > > >     overlayfs inode's i_version counter should aim to mirror the
> > > > >     relevant layer's i_version counter. I wouldn't know why that
> > > > >     shouldn't be the case. Asking the other way around there doesn't
> > > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > > >     isn't just mirroring the relevant layer's i_version.
> > > > 
> > > > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > > > inode.
> > > > 
> > > > You can't just copy up the value from the upper. You'll need to call
> > > > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > > > logged i_version update on the next write. IOW, this could create some
> > > > (probably minor) metadata write amplification in the upper layer inode
> > > > with IS_I_VERSION inodes.
> > > 
> > > I'm likely just missing context and am curious about this so bear with me. Why
> > > do we need to flag the upper inode for a logged i_version update? Any required
> > > i_version interactions should've already happened when overlayfs called into
> > > the upper layer. So all that's left to do is for overlayfs' to mirror the
> > > i_version value after the upper operation has returned.
> > 
> > > ovl_copyattr() - which copies the inode attributes - is always called after the
> > > operation on the upper inode has finished. So the additional query seems odd at
> > > first glance. But there might well be a good reason for it. In my naive
> > > approach I would've thought that sm along the lines of:
> > > 
> > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > index 923d66d131c1..8b089035b9b3 100644
> > > --- a/fs/overlayfs/util.c
> > > +++ b/fs/overlayfs/util.c
> > > @@ -1119,4 +1119,5 @@ void ovl_copyattr(struct inode *inode)
> > >         inode->i_mtime = realinode->i_mtime;
> > >         inode->i_ctime = realinode->i_ctime;
> > >         i_size_write(inode, i_size_read(realinode));
> > > +       inode_set_iversion_raw(inode, inode_peek_iversion_raw(realinode));
> > >  }
> > > 
> > > would've been sufficient.
> > > 
> > 
> > Nope, because then you wouldn't get any updates to i_version after that
> > point.
> > 
> > Note that with an IS_I_VERSION inode we only update the i_version when
> > there has been a query since the last update. What you're doing above is
> > circumventing that mechanism. You'll get the i_version at the time of of
> > the ovl_copyattr, but there won't be any updates of it after that point
> > because the QUERIED bit won't end up being set on realinode.
> 
> I get all that.
> But my understanding had been that the i_version value at the time of
> ovl_copyattr() would be correct. Because when ovl_copyattr() is called
> the expected i_version change will have been done in the relevant layer
> includig raising the QUERIED bit. Since the layers are not allowed to be
> changed outside of the overlayfs mount any change to them can only
> originate from overlayfs which would necessarily call ovl_copyattr()
> again. IOW, overlayfs would by virtue of its implementation keep the
> i_version value in sync.
>
> Overlayfs wouldn't even raise SB_I_VERSION. It would indeed just be a
> cache of i_version of the relevant layer.
> 
> > 
> > 
> > > Since overlayfs' does explicitly disallow changes to the upper and lower trees
> > > while overlayfs is mounted it seems intuitive that it should just mirror the
> > > relevant layer's i_version.
> > > 
> > > 
> > > If we don't do this, then we should probably document that i_version doesn't
> > > have a meaning yet for the inodes of stacking filesystems.
> > > 
> > 
> > Trying to cache the i_version is counterproductive, IMO, at least with
> > an IS_I_VERSION inode.
> > 
> > The problem is that a query against the i_version has a side-effect. It
> > has to (atomically) mark the inode for an update on the next change.
> > 
> > If you try to cache that value, you'll likely end up doing more queries
> > than you really need to (because you'll need to keep the cache up to
> > date) and you'll have an i_version that will necessarily lag the one in
> > the upper layer inode.
> > 
> > The whole point of the change attribute is to get the value as it is at
> > this very moment so we can check whether there have been changes. A
> > laggy value is not terribly useful.
> > 
> > Overlayfs should just always call the upper layer's ->getattr to get the
> > version. I wouldn't even bother copying it up in the first place. Doing
> > so is just encouraging someone to try use the value in the overlayfs
> > inode, when they really need to go through ->getattr and get the one
> > from the upper layer.
> 
> That seems reasonable to me. I read this as an agreeing with my earlier
> suggestion to document that i_version doesn't have a meaning for the
> inodes of stacking filesystems and that we should spell out that
> vfs_getattr()/->getattr() needs to be used to interact with i_version.
> 

It really has no meaning in the stacked filesystem's _inode_. The only
i_version that has any meaning in a (simple) stacking setup is the upper
layer inode.

> We need to explain to subsystems such as IMA somwhere what the correct
> way to query i_version agnostically is; independent of filesystem
> implementation details.
> 
> Looking at IMA, it queries the i_version directly without checking
> whether it's an IS_I_VERSION() inode first. This might make a
> difference.
> 

IMA should just use getattr. That allows the filesystem to present the
i_version to the caller as it sees fit. Fetching i_version directly
without testing for IS_I_VERSION is wrong, because you don't know what
that field contains, or whether the fs supports it at all.

> Afaict, filesystems that persist i_version to disk automatically raise
> SB_I_VERSION. I would guess that it be considered a bug if a filesystem
> would persist i_version to disk and not raise SB_I_VERSION. If so IMA
> should probably be made to check for IS_I_VERSION() and it will probably
> get that by switching to vfs_getattr_nosec().

Not quite. SB_I_VERSION tells the vfs that the filesystem wants the
kernel to manage the increment of the i_version for it. The filesystem
is still responsible for persisting that value to disk (if appropriate).

Switching to vfs_getattr_nosec should make it so IMA doesn't need to
worry about the gory details of all of this. If STATX_CHANGE_COOKIE is
set in the response, then it can trust that value. Otherwise, it's no
good.

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-11  9:32                                 ` Jeff Layton
@ 2023-04-11  9:49                                   ` Christian Brauner
  2023-04-11 10:13                                     ` Jeff Layton
  0 siblings, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-04-11  9:49 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Tue, Apr 11, 2023 at 05:32:11AM -0400, Jeff Layton wrote:
> On Tue, 2023-04-11 at 10:38 +0200, Christian Brauner wrote:
> > On Sun, Apr 09, 2023 at 06:12:09PM -0400, Jeff Layton wrote:
> > > On Sun, 2023-04-09 at 17:22 +0200, Christian Brauner wrote:
> > > > On Fri, Apr 07, 2023 at 09:29:29AM -0400, Jeff Layton wrote:
> > > > > > > > > 
> > > > > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > > > > 
> > > > > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > > > > 
> > > > > > We should cool it with the quick hacks to fix things. :)
> > > > > > 
> > > > > 
> > > > > Yeah. It might fix this specific testcase, but I think the way it uses
> > > > > the i_version is "gameable" in other situations. Then again, I don't
> > > > > know a lot about IMA in this regard.
> > > > > 
> > > > > When is it expected to remeasure? If it's only expected to remeasure on
> > > > > a close(), then that's one thing. That would be a weird design though.
> > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > > > > overlayfs inode.
> > > > > > > > 
> > > > > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > > > > the upper layer in its response.
> > > > > > > > 
> > > > > > > > That sort of design may also eventually help IMA work properly with more
> > > > > > > > exotic filesystems, like NFS or Ceph.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > > > > looks like overlayfs already should report the upper layer's i_version
> > > > > > > in getattr, though I haven't tested that either:
> > > > > > > 
> > > > > > > -----------------------8<---------------------------
> > > > > > > 
> > > > > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > > > > 
> > > > > > > IMA currently accesses the i_version out of the inode directly when it
> > > > > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > > > > problematic with more complex setups (e.g. overlayfs).
> > > > > > > 
> > > > > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > > > > the filesystem to determine whether and how to report the i_version, and
> > > > > > > should allow IMA to work properly with a broader class of filesystems in
> > > > > > > the future.
> > > > > > > 
> > > > > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > ---
> > > > > > 
> > > > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > > > vfs_getattr_nosec() change:
> > > > > > 
> > > > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > > > >     is in line what we do with all other inode attributes. IOW, the
> > > > > >     overlayfs inode's i_version counter should aim to mirror the
> > > > > >     relevant layer's i_version counter. I wouldn't know why that
> > > > > >     shouldn't be the case. Asking the other way around there doesn't
> > > > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > > > >     isn't just mirroring the relevant layer's i_version.
> > > > > 
> > > > > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > > > > inode.
> > > > > 
> > > > > You can't just copy up the value from the upper. You'll need to call
> > > > > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > > > > logged i_version update on the next write. IOW, this could create some
> > > > > (probably minor) metadata write amplification in the upper layer inode
> > > > > with IS_I_VERSION inodes.
> > > > 
> > > > I'm likely just missing context and am curious about this so bear with me. Why
> > > > do we need to flag the upper inode for a logged i_version update? Any required
> > > > i_version interactions should've already happened when overlayfs called into
> > > > the upper layer. So all that's left to do is for overlayfs' to mirror the
> > > > i_version value after the upper operation has returned.
> > > 
> > > > ovl_copyattr() - which copies the inode attributes - is always called after the
> > > > operation on the upper inode has finished. So the additional query seems odd at
> > > > first glance. But there might well be a good reason for it. In my naive
> > > > approach I would've thought that sm along the lines of:
> > > > 
> > > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > > index 923d66d131c1..8b089035b9b3 100644
> > > > --- a/fs/overlayfs/util.c
> > > > +++ b/fs/overlayfs/util.c
> > > > @@ -1119,4 +1119,5 @@ void ovl_copyattr(struct inode *inode)
> > > >         inode->i_mtime = realinode->i_mtime;
> > > >         inode->i_ctime = realinode->i_ctime;
> > > >         i_size_write(inode, i_size_read(realinode));
> > > > +       inode_set_iversion_raw(inode, inode_peek_iversion_raw(realinode));
> > > >  }
> > > > 
> > > > would've been sufficient.
> > > > 
> > > 
> > > Nope, because then you wouldn't get any updates to i_version after that
> > > point.
> > > 
> > > Note that with an IS_I_VERSION inode we only update the i_version when
> > > there has been a query since the last update. What you're doing above is
> > > circumventing that mechanism. You'll get the i_version at the time of of
> > > the ovl_copyattr, but there won't be any updates of it after that point
> > > because the QUERIED bit won't end up being set on realinode.
> > 
> > I get all that.
> > But my understanding had been that the i_version value at the time of
> > ovl_copyattr() would be correct. Because when ovl_copyattr() is called
> > the expected i_version change will have been done in the relevant layer
> > includig raising the QUERIED bit. Since the layers are not allowed to be
> > changed outside of the overlayfs mount any change to them can only
> > originate from overlayfs which would necessarily call ovl_copyattr()
> > again. IOW, overlayfs would by virtue of its implementation keep the
> > i_version value in sync.
> >
> > Overlayfs wouldn't even raise SB_I_VERSION. It would indeed just be a
> > cache of i_version of the relevant layer.
> > 
> > > 
> > > 
> > > > Since overlayfs' does explicitly disallow changes to the upper and lower trees
> > > > while overlayfs is mounted it seems intuitive that it should just mirror the
> > > > relevant layer's i_version.
> > > > 
> > > > 
> > > > If we don't do this, then we should probably document that i_version doesn't
> > > > have a meaning yet for the inodes of stacking filesystems.
> > > > 
> > > 
> > > Trying to cache the i_version is counterproductive, IMO, at least with
> > > an IS_I_VERSION inode.
> > > 
> > > The problem is that a query against the i_version has a side-effect. It
> > > has to (atomically) mark the inode for an update on the next change.
> > > 
> > > If you try to cache that value, you'll likely end up doing more queries
> > > than you really need to (because you'll need to keep the cache up to
> > > date) and you'll have an i_version that will necessarily lag the one in
> > > the upper layer inode.
> > > 
> > > The whole point of the change attribute is to get the value as it is at
> > > this very moment so we can check whether there have been changes. A
> > > laggy value is not terribly useful.
> > > 
> > > Overlayfs should just always call the upper layer's ->getattr to get the
> > > version. I wouldn't even bother copying it up in the first place. Doing
> > > so is just encouraging someone to try use the value in the overlayfs
> > > inode, when they really need to go through ->getattr and get the one
> > > from the upper layer.
> > 
> > That seems reasonable to me. I read this as an agreeing with my earlier
> > suggestion to document that i_version doesn't have a meaning for the
> > inodes of stacking filesystems and that we should spell out that
> > vfs_getattr()/->getattr() needs to be used to interact with i_version.
> > 
> 
> It really has no meaning in the stacked filesystem's _inode_. The only
> i_version that has any meaning in a (simple) stacking setup is the upper
> layer inode.

Ok, we're on the same page then.

> 
> > We need to explain to subsystems such as IMA somwhere what the correct
> > way to query i_version agnostically is; independent of filesystem
> > implementation details.
> > 
> > Looking at IMA, it queries the i_version directly without checking
> > whether it's an IS_I_VERSION() inode first. This might make a
> > difference.
> > 
> 
> IMA should just use getattr. That allows the filesystem to present the
> i_version to the caller as it sees fit. Fetching i_version directly
> without testing for IS_I_VERSION is wrong, because you don't know what
> that field contains, or whether the fs supports it at all.

Yep, same page again.

> 
> > Afaict, filesystems that persist i_version to disk automatically raise
> > SB_I_VERSION. I would guess that it be considered a bug if a filesystem
> > would persist i_version to disk and not raise SB_I_VERSION. If so IMA
> > should probably be made to check for IS_I_VERSION() and it will probably
> > get that by switching to vfs_getattr_nosec().
> 
> Not quite. SB_I_VERSION tells the vfs that the filesystem wants the
> kernel to manage the increment of the i_version for it. The filesystem
> is still responsible for persisting that value to disk (if appropriate).

Yes, sure, it's the filesystem's responsibility to persist it to disk or
not. What I tried to ask was that when a filesystem does persist
i_version to disk then would it be legal to mount it without
SB_I_VERSION (because ext2/ext3 did use to have that mount option)? If
it would then the filesystem would probably need to take care to leave
the i_version field in struct inode uninitialized to avoid confusion or
would that just work? (Mere curiosity, don't feel obligated to go into
detail here. I don't want to hog your time.)

> 
> Switching to vfs_getattr_nosec should make it so IMA doesn't need to
> worry about the gory details of all of this. If STATX_CHANGE_COOKIE is
> set in the response, then it can trust that value. Otherwise, it's no
> good.

Yep, same page again.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-11  9:49                                   ` Christian Brauner
@ 2023-04-11 10:13                                     ` Jeff Layton
  2023-04-11 14:08                                       ` Christian Brauner
  0 siblings, 1 reply; 61+ messages in thread
From: Jeff Layton @ 2023-04-11 10:13 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Tue, 2023-04-11 at 11:49 +0200, Christian Brauner wrote:
> 
> > 
> > > Afaict, filesystems that persist i_version to disk automatically raise
> > > SB_I_VERSION. I would guess that it be considered a bug if a filesystem
> > > would persist i_version to disk and not raise SB_I_VERSION. If so IMA
> > > should probably be made to check for IS_I_VERSION() and it will probably
> > > get that by switching to vfs_getattr_nosec().
> > 
> > Not quite. SB_I_VERSION tells the vfs that the filesystem wants the
> > kernel to manage the increment of the i_version for it. The filesystem
> > is still responsible for persisting that value to disk (if appropriate).
> 
> Yes, sure it's the filesystems responsibility to persist it to disk or
> not. What I tried to ask was that when a filesystem does persist
> i_version to disk then would it be legal to mount it without
> SB_I_VERSION (because ext2/ext3 did use to have that mount option)? If
> it would then the filesystem would probably need to take care to leave
> the i_version field in struct inode uninitialized to avoid confusion or
> would that just work? (Mere curiosity, don't feel obligated to go into
> detail here. I don't want to hog your time.)
> 

In modern kernels, not setting SB_I_VERSION would mainly have the effect
of stopping increments of i_version field on write. It would also mean
that the STATX_CHANGE_COOKIE is not automatically reported via getattr.

You probably wouldn't want to mount the fs without SB_I_VERSION set. The
missing increments could trick an observer into believing that nothing
had changed in the file across mounts when it actually had.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-11 10:13                                     ` Jeff Layton
@ 2023-04-11 14:08                                       ` Christian Brauner
  0 siblings, 0 replies; 61+ messages in thread
From: Christian Brauner @ 2023-04-11 14:08 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, zohar,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Tue, Apr 11, 2023 at 06:13:15AM -0400, Jeff Layton wrote:
> On Tue, 2023-04-11 at 11:49 +0200, Christian Brauner wrote:
> > 
> > > 
> > > > Afaict, filesystems that persist i_version to disk automatically raise
> > > > SB_I_VERSION. I would guess that it be considered a bug if a filesystem
> > > > would persist i_version to disk and not raise SB_I_VERSION. If so IMA
> > > > should probably be made to check for IS_I_VERSION() and it will probably
> > > > get that by switching to vfs_getattr_nosec().
> > > 
> > > Not quite. SB_I_VERSION tells the vfs that the filesystem wants the
> > > kernel to manage the increment of the i_version for it. The filesystem
> > > is still responsible for persisting that value to disk (if appropriate).
> > 
> > Yes, sure it's the filesystems responsibility to persist it to disk or
> > not. What I tried to ask was that when a filesystem does persist
> > i_version to disk then would it be legal to mount it without
> > SB_I_VERSION (because ext2/ext3 did use to have that mount option)? If
> > it would then the filesystem would probably need to take care to leave
> > the i_version field in struct inode uninitialized to avoid confusion or
> > would that just work? (Mere curiosity, don't feel obligated to go into
> > detail here. I don't want to hog your time.)
> > 
> 
> In modern kernels, not setting SB_I_VERSION would mainly have the effect
> of stopping increments of i_version field on write. It would also mean
> that the STATX_CHANGE_COOKIE is not automatically reported via getattr.

Ah, good.

> 
> You probably wouldn't want to mount the fs without SB_I_VERSION set. The
> missing increments could trick an observer into believing that nothing
> had changed in the file across mounts when it actually had.

Yeah, that's what I thought and that would potentially be an attack on
IMA which is why I asked.

Thanks!
Christian

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-07 13:29                         ` Jeff Layton
  2023-04-09 15:22                           ` Christian Brauner
@ 2023-04-17  1:57                           ` Stefan Berger
  2023-04-17  8:11                             ` Christian Brauner
  2023-04-17 10:05                             ` Jeff Layton
  2023-04-21 14:43                           ` Mimi Zohar
  2 siblings, 2 replies; 61+ messages in thread
From: Stefan Berger @ 2023-04-17  1:57 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Amir Goldstein
  Cc: Paul Moore, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs



On 4/7/23 09:29, Jeff Layton wrote:
>>>>>
>>>>> I would ditch the original proposal in favor of this 2-line patch shown here:
>>>>>
>>>>> https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
>>
>> We should cool it with the quick hacks to fix things. :)
>>
> 
> Yeah. It might fix this specific testcase, but I think the way it uses
> the i_version is "gameable" in other situations. Then again, I don't
> know a lot about IMA in this regard.
> 
> When is it expected to remeasure? If it's only expected to remeasure on
> a close(), then that's one thing. That would be a weird design though.

IMA should remeasure the file when it has visibly changed for another thread or process.


>>> -----------------------8<---------------------------
>>>
>>> [PATCH] IMA: use vfs_getattr_nosec to get the i_version
>>>
>>> IMA currently accesses the i_version out of the inode directly when it
>>> does a measurement. This is fine for most simple filesystems, but can be
>>> problematic with more complex setups (e.g. overlayfs).
>>>
>>> Make IMA instead call vfs_getattr_nosec to get this info. This allows
>>> the filesystem to determine whether and how to report the i_version, and
>>> should allow IMA to work properly with a broader class of filesystems in
>>> the future.
>>>
>>> Reported-by: Stefan Berger <stefanb@linux.ibm.com>
>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>> ---
>>
>> So, I think we want both; we want the ovl_copyattr() and the
>> vfs_getattr_nosec() change:
>>
>> (1) overlayfs should copy up the inode version in ovl_copyattr(). That
>>      is in line what we do with all other inode attributes. IOW, the
>>      overlayfs inode's i_version counter should aim to mirror the
>>      relevant layer's i_version counter. I wouldn't know why that
>>      shouldn't be the case. Asking the other way around there doesn't
>>      seem to be any use for overlayfs inodes to have an i_version that
>>      isn't just mirroring the relevant layer's i_version.
> 
> It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> inode.
> 
> You can't just copy up the value from the upper. You'll need to call
> inode_query_iversion(upper_inode), which will flag the upper inode for a
> logged i_version update on the next write. IOW, this could create some
> (probably minor) metadata write amplification in the upper layer inode
> with IS_I_VERSION inodes.
> 
> 
>> (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
>>      Currently, ima assumes that it will get the correct i_version from
>>      an inode but that just doesn't hold for stacking filesystem.
>>
>> While (1) would likely just fix the immediate bug (2) is correct and
>> _robust_. If we change how attributes are handled vfs_*() helpers will
>> get updated and ima with it. Poking at raw inodes without using
>> appropriate helpers is much more likely to get ima into trouble.
> 
> This will fix it the right way, I think (assuming it actually works),
> and should open the door for IMA to work properly with networked
> filesystems that support i_version as well.
> 
> Note that there Stephen is correct that calling getattr is probably
> going to be less efficient here since we're going to end up calling
> generic_fillattr unnecessarily, but I still think it's the right thing
> to do.

I was wondering whether to use the existing inode_eq_iversion() for all
other filesystems than overlayfs, nfs, and possibly other ones (which ones?)
where we would use the vfs_getattr_nosec() via a case on inode->i_sb->s_magic?
If so, would this function be generic enough to be a public function for libfs.c?

I'll hopefully be able to test the proposed patch tomorrow.

> 
> If it turns out to cause measurable performance regressions though,
> maybe we can look at adding a something that still calls ->getattr if it
> exists but only returns the change_cookie value.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-17  1:57                           ` Stefan Berger
@ 2023-04-17  8:11                             ` Christian Brauner
  2023-04-17 10:05                             ` Jeff Layton
  1 sibling, 0 replies; 61+ messages in thread
From: Christian Brauner @ 2023-04-17  8:11 UTC (permalink / raw)
  To: Stefan Berger
  Cc: Jeff Layton, Amir Goldstein, Paul Moore, zohar, linux-integrity,
	miklos, linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs

On Sun, Apr 16, 2023 at 09:57:10PM -0400, Stefan Berger wrote:
> 
> 
> On 4/7/23 09:29, Jeff Layton wrote:
> > > > > > 
> > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > 
> > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > 
> > > We should cool it with the quick hacks to fix things. :)
> > > 
> > 
> > Yeah. It might fix this specific testcase, but I think the way it uses
> > the i_version is "gameable" in other situations. Then again, I don't
> > know a lot about IMA in this regard.
> > 
> > When is it expected to remeasure? If it's only expected to remeasure on
> > a close(), then that's one thing. That would be a weird design though.
> 
> IMA should remeasure the file when it has visibly changed for another thread or process.
> 
> 
> > > > -----------------------8<---------------------------
> > > > 
> > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > 
> > > > IMA currently accesses the i_version out of the inode directly when it
> > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > problematic with more complex setups (e.g. overlayfs).
> > > > 
> > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > the filesystem to determine whether and how to report the i_version, and
> > > > should allow IMA to work properly with a broader class of filesystems in
> > > > the future.
> > > > 
> > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > 
> > > So, I think we want both; we want the ovl_copyattr() and the
> > > vfs_getattr_nosec() change:
> > > 
> > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > >      is in line what we do with all other inode attributes. IOW, the
> > >      overlayfs inode's i_version counter should aim to mirror the
> > >      relevant layer's i_version counter. I wouldn't know why that
> > >      shouldn't be the case. Asking the other way around there doesn't
> > >      seem to be any use for overlayfs inodes to have an i_version that
> > >      isn't just mirroring the relevant layer's i_version.
> > 
> > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > inode.
> > 
> > You can't just copy up the value from the upper. You'll need to call
> > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > logged i_version update on the next write. IOW, this could create some
> > (probably minor) metadata write amplification in the upper layer inode
> > with IS_I_VERSION inodes.
> > 
> > 
> > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > >      Currently, ima assumes that it will get the correct i_version from
> > >      an inode but that just doesn't hold for stacking filesystem.
> > > 
> > > While (1) would likely just fix the immediate bug (2) is correct and
> > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > get updated and ima with it. Poking at raw inodes without using
> > > appropriate helpers is much more likely to get ima into trouble.
> > 
> > This will fix it the right way, I think (assuming it actually works),
> > and should open the door for IMA to work properly with networked
> > filesystems that support i_version as well.
> > 
> > Note that there Stephen is correct that calling getattr is probably
> > going to be less efficient here since we're going to end up calling
> > generic_fillattr unnecessarily, but I still think it's the right thing
> > to do.
> 
> I was wondering whether to use the existing inode_eq_iversion() for all
> other filesystems than overlayfs, nfs, and possibly other ones (which ones?)
> where we would use the vfs_getattr_nosec() via a case on inode->i_sb->s_magic?
> If so, would this function be generic enough to be a public function for libfs.c?

That's just an invitation for bugs and maintenance headaches. Just call
vfs_getattr_nosec() directly and measure the performance impact before
trying to optimize this. If you see performance impact that is worth
mentioning then we can explore other options such as allowing
->getattr() to only query for i_version and nothing else.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-17  1:57                           ` Stefan Berger
  2023-04-17  8:11                             ` Christian Brauner
@ 2023-04-17 10:05                             ` Jeff Layton
  2023-04-17 12:45                               ` Stefan Berger
  1 sibling, 1 reply; 61+ messages in thread
From: Jeff Layton @ 2023-04-17 10:05 UTC (permalink / raw)
  To: Stefan Berger, Christian Brauner, Amir Goldstein
  Cc: Paul Moore, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs

On Sun, 2023-04-16 at 21:57 -0400, Stefan Berger wrote:
> 
> On 4/7/23 09:29, Jeff Layton wrote:
> > > > > > 
> > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > 
> > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > 
> > > We should cool it with the quick hacks to fix things. :)
> > > 
> > 
> > Yeah. It might fix this specific testcase, but I think the way it uses
> > the i_version is "gameable" in other situations. Then again, I don't
> > know a lot about IMA in this regard.
> > 
> > When is it expected to remeasure? If it's only expected to remeasure on
> > a close(), then that's one thing. That would be a weird design though.
> 
> IMA should remeasure the file when it has visibly changed for another thread or process.
> 
> 
> > > > -----------------------8<---------------------------
> > > > 
> > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > 
> > > > IMA currently accesses the i_version out of the inode directly when it
> > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > problematic with more complex setups (e.g. overlayfs).
> > > > 
> > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > the filesystem to determine whether and how to report the i_version, and
> > > > should allow IMA to work properly with a broader class of filesystems in
> > > > the future.
> > > > 
> > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > 
> > > So, I think we want both; we want the ovl_copyattr() and the
> > > vfs_getattr_nosec() change:
> > > 
> > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > >      is in line what we do with all other inode attributes. IOW, the
> > >      overlayfs inode's i_version counter should aim to mirror the
> > >      relevant layer's i_version counter. I wouldn't know why that
> > >      shouldn't be the case. Asking the other way around there doesn't
> > >      seem to be any use for overlayfs inodes to have an i_version that
> > >      isn't just mirroring the relevant layer's i_version.
> > 
> > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > inode.
> > 
> > You can't just copy up the value from the upper. You'll need to call
> > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > logged i_version update on the next write. IOW, this could create some
> > (probably minor) metadata write amplification in the upper layer inode
> > with IS_I_VERSION inodes.
> > 
> > 
> > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > >      Currently, ima assumes that it will get the correct i_version from
> > >      an inode but that just doesn't hold for stacking filesystem.
> > > 
> > > While (1) would likely just fix the immediate bug (2) is correct and
> > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > get updated and ima with it. Poking at raw inodes without using
> > > appropriate helpers is much more likely to get ima into trouble.
> > 
> > This will fix it the right way, I think (assuming it actually works),
> > and should open the door for IMA to work properly with networked
> > filesystems that support i_version as well.
> > 
> > Note that there Stephen is correct that calling getattr is probably
> > going to be less efficient here since we're going to end up calling
> > generic_fillattr unnecessarily, but I still think it's the right thing
> > to do.
> 
> I was wondering whether to use the existing inode_eq_iversion() for all
> other filesystems than overlayfs, nfs, and possibly other ones (which ones?)
> where we would use the vfs_getattr_nosec() via a case on inode->i_sb->s_magic?
> If so, would this function be generic enough to be a public function for libfs.c?
> 
> I'll hopefully be able to test the proposed patch tomorrow.
> 
> 

No, you don't want to use inode_eq_iversion here because (as the comment
over it says):

 * Note that we don't need to set the QUERIED flag in this case, as the value
 * in the inode is not being recorded for later use.

The IMA code _does_ record the value for later use. Furthermore, it's
not valid to use inode_eq_iversion on a non-IS_I_VERSION inode, so it's
better to just use vfs_getattr_nosec which allows IMA to avoid all of
those gory details.

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-17 10:05                             ` Jeff Layton
@ 2023-04-17 12:45                               ` Stefan Berger
  2023-04-17 13:18                                 ` Jeff Layton
  0 siblings, 1 reply; 61+ messages in thread
From: Stefan Berger @ 2023-04-17 12:45 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Amir Goldstein
  Cc: Paul Moore, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs



On 4/17/23 06:05, Jeff Layton wrote:
> On Sun, 2023-04-16 at 21:57 -0400, Stefan Berger wrote:
>>
>> On 4/7/23 09:29, Jeff Layton wrote:

>>>
>>> Note that there Stephen is correct that calling getattr is probably
>>> going to be less efficient here since we're going to end up calling
>>> generic_fillattr unnecessarily, but I still think it's the right thing
>>> to do.
>>
>> I was wondering whether to use the existing inode_eq_iversion() for all
>> other filesystems than overlayfs, nfs, and possibly other ones (which ones?)
>> where we would use the vfs_getattr_nosec() via a case on inode->i_sb->s_magic?
>> If so, would this function be generic enough to be a public function for libfs.c?
>>
>> I'll hopefully be able to test the proposed patch tomorrow.
>>
>>
> 
> No, you don't want to use inode_eq_iversion here because (as the comment
> over it says):

In the ima_check_last_writer() case the usage of inode_eq_iversion() was correct since
at this point no record of its value was made and therefore no writer needed to change
the i_version again due to IMA:

		update = test_and_clear_bit(IMA_UPDATE_XATTR,
					    &iint->atomic_flags);
		if (!IS_I_VERSION(inode) ||
		    !inode_eq_iversion(inode, iint->version) ||
		    (iint->flags & IMA_NEW_FILE)) {
			iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
			iint->measured_pcrs = 0;
			if (update)
				ima_update_xattr(iint, file);
		}

The record of the value is only made when the actual measurement is done in
ima_collect_measurement()

Compared to this the usage of vfs_getattr_nosec() is expensive since it resets the flag.

         if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
                 stat->result_mask |= STATX_CHANGE_COOKIE;
                 stat->change_cookie = inode_query_iversion(inode);
         }

	idmap = mnt_idmap(path->mnt);
	if (inode->i_op->getattr)
		return inode->i_op->getattr(idmap, path, stat,
					    request_mask, query_flags);

Also, many filesystems will have their getattr now called as well.

I understand Christian's argument about the maintenance headache to a certain degree...

    Stefan

> 
>   * Note that we don't need to set the QUERIED flag in this case, as the value
>   * in the inode is not being recorded for later use.
> 
> The IMA code _does_ record the value for later use. Furthermore, it's
> not valid to use inode_eq_iversion on a non-IS_I_VERSION inode, so it's
> better to just use vfs_getattr_nosec which allows IMA to avoid all of
> those gory details.
> 
> Thanks,

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-17 12:45                               ` Stefan Berger
@ 2023-04-17 13:18                                 ` Jeff Layton
  0 siblings, 0 replies; 61+ messages in thread
From: Jeff Layton @ 2023-04-17 13:18 UTC (permalink / raw)
  To: Stefan Berger, Christian Brauner, Amir Goldstein
  Cc: Paul Moore, zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs

On Mon, 2023-04-17 at 08:45 -0400, Stefan Berger wrote:
> 
> On 4/17/23 06:05, Jeff Layton wrote:
> > On Sun, 2023-04-16 at 21:57 -0400, Stefan Berger wrote:
> > > 
> > > On 4/7/23 09:29, Jeff Layton wrote:
> 
> > > > 
> > > > Note that there Stephen is correct that calling getattr is probably
> > > > going to be less efficient here since we're going to end up calling
> > > > generic_fillattr unnecessarily, but I still think it's the right thing
> > > > to do.
> > > 
> > > I was wondering whether to use the existing inode_eq_iversion() for all
> > > other filesystems than overlayfs, nfs, and possibly other ones (which ones?)
> > > where we would use the vfs_getattr_nosec() via a case on inode->i_sb->s_magic?
> > > If so, would this function be generic enough to be a public function for libfs.c?
> > > 
> > > I'll hopefully be able to test the proposed patch tomorrow.
> > > 
> > > 
> > 
> > No, you don't want to use inode_eq_iversion here because (as the comment
> > over it says):
> 
> In the ima_check_last_writer() case the usage of inode_eq_iversion() was correct since
> at this point no record of  its value was made and therefore no writer needed to change
> the i_value again due to IMA:
> 
> 		update = test_and_clear_bit(IMA_UPDATE_XATTR,
> 					    &iint->atomic_flags);
> 		if (!IS_I_VERSION(inode) ||
> 		    !inode_eq_iversion(inode, iint->version) ||
> 		    (iint->flags & IMA_NEW_FILE)) {
> 			iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
> 			iint->measured_pcrs = 0;
> 			if (update)
> 				ima_update_xattr(iint, file);
> 		}
> 
> The record of the value is only made when the actual measurement is done in
> ima_collect_measurement()
> 

True, but we don't have a generic mechanism to do this. What you're
doing only works for IS_I_VERSION inodes.

> Compared to this the usage of vfs_getattr_nosec() is expensive since it resets the flag.
> 
>          if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
>                  stat->result_mask |= STATX_CHANGE_COOKIE;
>                  stat->change_cookie = inode_query_iversion(inode);
>          }
> 
> 	idmap = mnt_idmap(path->mnt);
> 	if (inode->i_op->getattr)
> 		return inode->i_op->getattr(idmap, path, stat,
> 					    request_mask, query_flags);
> 
> Also, many filesystems will have their getattr now called as well.
> 

...as they should!

> I understand Christian's argument about the maintenance headache to a certain degree...
> 

IMA is not equipped to understand the subtleties of how the i_version
counter is implemented on different filesystems. In the past it dealt
with this by limiting its usage to IS_I_VERSION inodes, but that is
already problematic today. For instance: xfs currently sets the
SB_I_VERSION flag, but its i_version counter also bumps the value on
atime updates. That means that IMA is doing more remeasurements on xfs
than are needed.

I'm trying to clean a lot of this up, but IMA's current usage isn't
really helping since it's poking around in areas it shouldn't be. Doing
a getattr is the canonical way to query this value since it leaves it up
to the filesystem how to report this value.

If this turns out to cause a performance regression we can look at
adding a getattr-like routine that _only_ reports the change attribute.
I wouldn't want to do that though unless the need were clear (and backed
up by performance numbers).
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-06 22:04                     ` Jeff Layton
  2023-04-06 22:27                       ` Stefan Berger
  2023-04-07  8:31                       ` Christian Brauner
@ 2023-04-17 14:07                       ` Stefan Berger
  2 siblings, 0 replies; 61+ messages in thread
From: Stefan Berger @ 2023-04-17 14:07 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Paul Moore
  Cc: zohar, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs, amir73il



On 4/6/23 18:04, Jeff Layton wrote:
> On Thu, 2023-04-06 at 17:24 -0400, Jeff Layton wrote:
>> On Thu, 2023-04-06 at 16:22 -0400, Stefan Berger wrote:
>>>
>>> On 4/6/23 15:37, Jeff Layton wrote:
>>>> On Thu, 2023-04-06 at 15:11 -0400, Stefan Berger wrote:
>>>>>
>>>>> On 4/6/23 14:46, Jeff Layton wrote:
>>>>>> On Thu, 2023-04-06 at 17:01 +0200, Christian Brauner wrote:
>>>>>>> On Thu, Apr 06, 2023 at 10:36:41AM -0400, Paul Moore wrote:
>>>>>
>>>>>>
>>>>>> Correct. As long as IMA is also measuring the upper inode then it seems
>>>>>> like you shouldn't need to do anything special here.
>>>>>
>>>>> Unfortunately IMA does not notice the changes. With the patch provided in the other email IMA works as expected.
>>>>>
>>>>
>>>>
>>>> It looks like remeasurement is usually done in ima_check_last_writer.
>>>> That gets called from __fput which is called when we're releasing the
>>>> last reference to the struct file.
>>>>
>>>> You've hooked into the ->release op, which gets called whenever
>>>> filp_close is called, which happens when we're disassociating the file
>>>> from the file descriptor table.
>>>>
>>>> So...I don't get it. Is ima_file_free not getting called on your file
>>>> for some reason when you go to close it? It seems like that should be
>>>> handling this.
>>>
>>> I would ditch the original proposal in favor of this 2-line patch shown here:
>>>
>>> https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
>>>
>>>
>>
>> Ok, I think I get it. IMA is trying to use the i_version from the
>> overlayfs inode.
>>
>> I suspect that the real problem here is that IMA is just doing a bare
>> inode_query_iversion. Really, we ought to make IMA call
>> vfs_getattr_nosec (or something like it) to query the getattr routine in
>> the upper layer. Then overlayfs could just propagate the results from
>> the upper layer in its response.
>>
>> That sort of design may also eventually help IMA work properly with more
>> exotic filesystems, like NFS or Ceph.
>>
>>
>>
> 
> Maybe something like this? It builds for me but I haven't tested it. It
> looks like overlayfs already should report the upper layer's i_version
> in getattr, though I haven't tested that either:
> 
> -----------------------8<---------------------------
> 
> [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> 
> IMA currently accesses the i_version out of the inode directly when it
> does a measurement. This is fine for most simple filesystems, but can be
> problematic with more complex setups (e.g. overlayfs).
> 
> Make IMA instead call vfs_getattr_nosec to get this info. This allows
> the filesystem to determine whether and how to report the i_version, and
> should allow IMA to work properly with a broader class of filesystems in
> the future.
> 
> Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>   security/integrity/ima/ima_api.c  |  9 ++++++---
>   security/integrity/ima/ima_main.c | 12 ++++++++----
>   2 files changed, 14 insertions(+), 7 deletions(-)
> 
> diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
> index d3662f4acadc..c45902e72044 100644
> --- a/security/integrity/ima/ima_api.c
> +++ b/security/integrity/ima/ima_api.c
> @@ -13,7 +13,6 @@
>   #include <linux/fs.h>
>   #include <linux/xattr.h>
>   #include <linux/evm.h>
> -#include <linux/iversion.h>
>   #include <linux/fsverity.h>
>   
>   #include "ima.h"
> @@ -246,10 +245,11 @@ int ima_collect_measurement(struct integrity_iint_cache *iint,
>   	struct inode *inode = file_inode(file);
>   	const char *filename = file->f_path.dentry->d_name.name;
>   	struct ima_max_digest_data hash;
> +	struct kstat stat;
>   	int result = 0;
>   	int length;
>   	void *tmpbuf;
> -	u64 i_version;
> +	u64 i_version = 0;
>   
>   	/*
>   	 * Always collect the modsig, because IMA might have already collected
> @@ -268,7 +268,10 @@ int ima_collect_measurement(struct integrity_iint_cache *iint,
>   	 * to an initial measurement/appraisal/audit, but was modified to
>   	 * assume the file changed.
>   	 */
> -	i_version = inode_query_iversion(inode);
> +	result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE,
> +				   AT_STATX_SYNC_AS_STAT);
> +	if (!result && (stat.result_mask & STATX_CHANGE_COOKIE))
> +		i_version = stat.change_cookie;
>   	hash.hdr.algo = algo;
>   	hash.hdr.length = hash_digest_size[algo];
>   
> diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
> index d66a0a36415e..365db0e43d7c 100644
> --- a/security/integrity/ima/ima_main.c
> +++ b/security/integrity/ima/ima_main.c
> @@ -24,7 +24,6 @@
>   #include <linux/slab.h>
>   #include <linux/xattr.h>
>   #include <linux/ima.h>
> -#include <linux/iversion.h>
>   #include <linux/fs.h>
>   
>   #include "ima.h"
> @@ -164,11 +163,16 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint,
>   
>   	mutex_lock(&iint->mutex);
>   	if (atomic_read(&inode->i_writecount) == 1) {
> +		struct kstat stat;
> +
>   		update = test_and_clear_bit(IMA_UPDATE_XATTR,
>   					    &iint->atomic_flags);
> -		if (!IS_I_VERSION(inode) ||
> -		    !inode_eq_iversion(inode, iint->version) ||
> -		    (iint->flags & IMA_NEW_FILE)) {
> +		if ((iint->flags & IMA_NEW_FILE) ||
> +		    vfs_getattr_nosec(&file->f_path, &stat,
> +				      STATX_CHANGE_COOKIE,
> +				      AT_STATX_SYNC_AS_STAT) ||
> +		    !(stat.result_mask & STATX_CHANGE_COOKIE) ||
> +		    stat.change_cookie != iint->version) {
>   			iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
>   			iint->measured_pcrs = 0;
>   			if (update)

I tested this in the OpenBMC setup with overlayfs acting as rootfs. It works now as expected.

Tested-by: Stefan Berger <stefanb@linux.ibm.com>


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-07 13:29                         ` Jeff Layton
  2023-04-09 15:22                           ` Christian Brauner
  2023-04-17  1:57                           ` Stefan Berger
@ 2023-04-21 14:43                           ` Mimi Zohar
  2023-05-18 20:46                             ` Paul Moore
  2 siblings, 1 reply; 61+ messages in thread
From: Mimi Zohar @ 2023-04-21 14:43 UTC (permalink / raw)
  To: Jeff Layton, Christian Brauner, Amir Goldstein
  Cc: Stefan Berger, Paul Moore, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs

On Fri, 2023-04-07 at 09:29 -0400, Jeff Layton wrote:
> > > > > 
> > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > 
> > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > 
> > We should cool it with the quick hacks to fix things. :)
> > 
> 
> Yeah. It might fix this specific testcase, but I think the way it uses
> the i_version is "gameable" in other situations. Then again, I don't
> know a lot about IMA in this regard.
> 
> When is it expected to remeasure? If it's only expected to remeasure on
> a close(), then that's one thing. That would be a weird design though.

Historical background:

Prior to IMA being upstreamed there was a lot of discussion about how
much/how frequently to measure files.  Re-measuring files after each
write would impact performance.  Instead of re-measuring files after
each write, if a file already opened for write was opened for read
(open writers) or a file already opened for read was opened for write
(Time of Measure/Time of Use) the IMA measurement list was invalidated
by including a violation record in the measurement list.

Only the BPRM hook prevents a file from being opened for write.

> 
> > > > > 
> > > > > 
> > > > 
> > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > overlayfs inode.
> > > > 
> > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > inode_query_iversion. Really, we ought to make IMA call
> > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > the upper layer. Then overlayfs could just propagate the results from
> > > > the upper layer in its response.
> > > > 
> > > > That sort of design may also eventually help IMA work properly with more
> > > > exotic filesystems, like NFS or Ceph.
> > > > 
> > > > 
> > > > 
> > > 
> > > Maybe something like this? It builds for me but I haven't tested it. It
> > > looks like overlayfs already should report the upper layer's i_version
> > > in getattr, though I haven't tested that either:
> > > 
> > > -----------------------8<---------------------------
> > > 
> > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > 
> > > IMA currently accesses the i_version out of the inode directly when it
> > > does a measurement. This is fine for most simple filesystems, but can be
> > > problematic with more complex setups (e.g. overlayfs).
> > > 
> > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > the filesystem to determine whether and how to report the i_version, and
> > > should allow IMA to work properly with a broader class of filesystems in
> > > the future.
> > > 
> > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > 
> > So, I think we want both; we want the ovl_copyattr() and the
> > vfs_getattr_nosec() change:
> > 
> > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> >     is in line what we do with all other inode attributes. IOW, the
> >     overlayfs inode's i_version counter should aim to mirror the
> >     relevant layer's i_version counter. I wouldn't know why that
> >     shouldn't be the case. Asking the other way around there doesn't
> >     seem to be any use for overlayfs inodes to have an i_version that
> >     isn't just mirroring the relevant layer's i_version.
> 
> It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> inode.
> 
> You can't just copy up the value from the upper. You'll need to call
> inode_query_iversion(upper_inode), which will flag the upper inode for a
> logged i_version update on the next write. IOW, this could create some
> (probably minor) metadata write amplification in the upper layer inode
> with IS_I_VERSION inodes.
> 
> 
> > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> >     Currently, ima assumes that it will get the correct i_version from
> >     an inode but that just doesn't hold for stacking filesystem.
> > 
> > While (1) would likely just fix the immediate bug (2) is correct and
> > _robust_. If we change how attributes are handled vfs_*() helpers will
> > get updated and ima with it. Poking at raw inodes without using
> > appropriate helpers is much more likely to get ima into trouble.
> 
> This will fix it the right way, I think (assuming it actually works),
> and should open the door for IMA to work properly with networked
> filesystems that support i_version as well.

On a local filesystem, there are guarantees that the calculated file
hash is that of the file being used.  Reminder IMA reads a file, page
size chunk at a time into a single buffer, calculating the file hash. 
Once the file hash is calculated, the memory is freed.

There are no guarantees on a fuse filesystem, for example, that the
original file read and verified is the same as the one being executed. 
I'm not sure that the integrity guarantees of a file on a remote
filesystem will be the same as those on a local file system.

> 
> Note that there Stephen is correct that calling getattr is probably
> going to be less efficient here since we're going to end up calling
> generic_fillattr unnecessarily, but I still think it's the right thing
> to do.
> 
> If it turns out to cause measurable performance regressions though,
> maybe we can look at adding something that still calls ->getattr if it
> exists but only returns the change_cookie value.

Sure.  For now,

Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-11  8:38                               ` Christian Brauner
  2023-04-11  9:32                                 ` Jeff Layton
@ 2023-04-21 14:55                                 ` Mimi Zohar
  1 sibling, 0 replies; 61+ messages in thread
From: Mimi Zohar @ 2023-04-21 14:55 UTC (permalink / raw)
  To: Christian Brauner, Jeff Layton
  Cc: Amir Goldstein, Stefan Berger, Paul Moore, linux-integrity,
	miklos, linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs

On Tue, 2023-04-11 at 10:38 +0200, Christian Brauner wrote:
> On Sun, Apr 09, 2023 at 06:12:09PM -0400, Jeff Layton wrote:
> > On Sun, 2023-04-09 at 17:22 +0200, Christian Brauner wrote:
> > > On Fri, Apr 07, 2023 at 09:29:29AM -0400, Jeff Layton wrote:
> > > > > > > > 
> > > > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > > > 
> > > > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > > > 
> > > > > We should cool it with the quick hacks to fix things. :)
> > > > > 
> > > > 
> > > > Yeah. It might fix this specific testcase, but I think the way it uses
> > > > the i_version is "gameable" in other situations. Then again, I don't
> > > > know a lot about IMA in this regard.
> > > > 
> > > > When is it expected to remeasure? If it's only expected to remeasure on
> > > > a close(), then that's one thing. That would be a weird design though.
> > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > > > overlayfs inode.
> > > > > > > 
> > > > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > > > the upper layer in its response.
> > > > > > > 
> > > > > > > That sort of design may also eventually help IMA work properly with more
> > > > > > > exotic filesystems, like NFS or Ceph.
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > 
> > > > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > > > looks like overlayfs already should report the upper layer's i_version
> > > > > > in getattr, though I haven't tested that either:
> > > > > > 
> > > > > > -----------------------8<---------------------------
> > > > > > 
> > > > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > > > 
> > > > > > IMA currently accesses the i_version out of the inode directly when it
> > > > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > > > problematic with more complex setups (e.g. overlayfs).
> > > > > > 
> > > > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > > > the filesystem to determine whether and how to report the i_version, and
> > > > > > should allow IMA to work properly with a broader class of filesystems in
> > > > > > the future.
> > > > > > 
> > > > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > ---
> > > > > 
> > > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > > vfs_getattr_nosec() change:
> > > > > 
> > > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > > >     is in line what we do with all other inode attributes. IOW, the
> > > > >     overlayfs inode's i_version counter should aim to mirror the
> > > > >     relevant layer's i_version counter. I wouldn't know why that
> > > > >     shouldn't be the case. Asking the other way around there doesn't
> > > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > > >     isn't just mirroring the relevant layer's i_version.
> > > > 
> > > > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > > > inode.
> > > > 
> > > > You can't just copy up the value from the upper. You'll need to call
> > > > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > > > logged i_version update on the next write. IOW, this could create some
> > > > (probably minor) metadata write amplification in the upper layer inode
> > > > with IS_I_VERSION inodes.
> > > 
> > > I'm likely just missing context and am curious about this so bear with me. Why
> > > do we need to flag the upper inode for a logged i_version update? Any required
> > > i_version interactions should've already happened when overlayfs called into
> > > the upper layer. So all that's left to do is for overlayfs' to mirror the
> > > i_version value after the upper operation has returned.
> > 
> > > ovl_copyattr() - which copies the inode attributes - is always called after the
> > > operation on the upper inode has finished. So the additional query seems odd at
> > > first glance. But there might well be a good reason for it. In my naive
> > > approach I would've thought that sm along the lines of:
> > >
> > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > index 923d66d131c1..8b089035b9b3 100644
> > > --- a/fs/overlayfs/util.c
> > > +++ b/fs/overlayfs/util.c
> > > @@ -1119,4 +1119,5 @@ void ovl_copyattr(struct inode *inode)
> > >         inode->i_mtime = realinode->i_mtime;
> > >         inode->i_ctime = realinode->i_ctime;
> > >         i_size_write(inode, i_size_read(realinode));
> > > +       inode_set_iversion_raw(inode, inode_peek_iversion_raw(realinode));
> > >  }
> > > 
> > > would've been sufficient.
> > > 
> > 
> > Nope, because then you wouldn't get any updates to i_version after that
> > point.
> > 
> > Note that with an IS_I_VERSION inode we only update the i_version when
> > there has been a query since the last update. What you're doing above is
> > circumventing that mechanism. You'll get the i_version at the time of
> > the ovl_copyattr, but there won't be any updates of it after that point
> > because the QUERIED bit won't end up being set on realinode.
> 
> I get all that.
> But my understanding had been that the i_version value at the time of
> ovl_copyattr() would be correct. Because when ovl_copyattr() is called
> the expected i_version change will have been done in the relevant layer
> including raising the QUERIED bit. Since the layers are not allowed to be
> changed outside of the overlayfs mount any change to them can only
> originate from overlayfs which would necessarily call ovl_copyattr()
> again. IOW, overlayfs would by virtue of its implementation keep the
> i_version value in sync.
> 
> Overlayfs wouldn't even raise SB_I_VERSION. It would indeed just be a
> cache of i_version of the relevant layer.
> 
> > 
> > 
> > > Since overlayfs' does explicitly disallow changes to the upper and lower trees
> > > while overlayfs is mounted it seems intuitive that it should just mirror the
> > > relevant layer's i_version.
> > >
> > >
> > > If we don't do this, then we should probably document that i_version doesn't
> > > have a meaning yet for the inodes of stacking filesystems.
> > > 
> > 
> > Trying to cache the i_version is counterproductive, IMO, at least with
> > an IS_I_VERSION inode.
> > 
> > The problem is that a query against the i_version has a side-effect. It
> > has to (atomically) mark the inode for an update on the next change.
> > 
> > If you try to cache that value, you'll likely end up doing more queries
> > than you really need to (because you'll need to keep the cache up to
> > date) and you'll have an i_version that will necessarily lag the one in
> > the upper layer inode.
> > 
> > The whole point of the change attribute is to get the value as it is at
> > this very moment so we can check whether there have been changes. A
> > laggy value is not terribly useful.
> > 
> > Overlayfs should just always call the upper layer's ->getattr to get the
> > version. I wouldn't even bother copying it up in the first place. Doing
> > so is just encouraging someone to try use the value in the overlayfs
> > inode, when they really need to go through ->getattr and get the one
> > from the upper layer.
> 
> That seems reasonable to me. I read this as an agreeing with my earlier
> suggestion to document that i_version doesn't have a meaning for the
> inodes of stacking filesystems and that we should spell out that
> vfs_getattr()/->getattr() needs to be used to interact with i_version.
> 
> We need to explain to subsystems such as IMA somewhere what the correct
> way to query i_version agnostically is; independent of filesystem
> implementation details.
> 
> Looking at IMA, it queries the i_version directly without checking
> whether it's an IS_I_VERSION() inode first. This might make a
> difference.
> 
> Afaict, filesystems that persist i_version to disk automatically raise
> SB_I_VERSION. I would guess that it be considered a bug if a filesystem
> would persist i_version to disk and not raise SB_I_VERSION. If so IMA
> should probably be made to check for IS_I_VERSION() and it will probably
> get that by switching to vfs_getattr_nosec().

When the filesystem isn't mounted with I_VERSION, i_version should be
set to 0.

Originally when the filesystem wasn't mounted with I_VERSION support,
the file would only be measured once.  With commit ac0bf025d2c0 ("ima:
Use i_version only when filesystem supports it"), this changed.   The
"iint" flags are reset, causing the file to be re-
{measure/appraised/audited} on next access.

-- 
thanks,

Mimi


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-21 14:43                           ` Mimi Zohar
@ 2023-05-18 20:46                             ` Paul Moore
  2023-05-18 20:50                               ` Mimi Zohar
  0 siblings, 1 reply; 61+ messages in thread
From: Paul Moore @ 2023-05-18 20:46 UTC (permalink / raw)
  To: Mimi Zohar, Stefan Berger
  Cc: Jeff Layton, Christian Brauner, Amir Goldstein, linux-integrity,
	miklos, linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs

On Fri, Apr 21, 2023 at 10:44 AM Mimi Zohar <zohar@linux.ibm.com> wrote:
> On Fri, 2023-04-07 at 09:29 -0400, Jeff Layton wrote:
> > > > > >
> > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > >
> > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > >
> > > We should cool it with the quick hacks to fix things. :)
> > >
> >
> > Yeah. It might fix this specific testcase, but I think the way it uses
> > the i_version is "gameable" in other situations. Then again, I don't
> > know a lot about IMA in this regard.
> >
> > When is it expected to remeasure? If it's only expected to remeasure on
> > a close(), then that's one thing. That would be a weird design though.
>
> Historical background:
>
> Prior to IMA being upstreamed there was a lot of discussion about how
> much/how frequently to measure files.  Re-measuring files after each
> write would impact performance.  Instead of re-measuring files after
> each write, if a file already opened for write was opened for read
> (open writers) or a file already opened for read was opened for write
> (Time of Measure/Time of Use) the IMA measurement list was invalidated
> by including a violation record in the measurement list.
>
> Only the BPRM hook prevents a file from being opened for write.
>
> >
> > > > > >
> > > > > >
> > > > >
> > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > overlayfs inode.
> > > > >
> > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > the upper layer in its response.
> > > > >
> > > > > That sort of design may also eventually help IMA work properly with more
> > > > > exotic filesystems, like NFS or Ceph.
> > > > >
> > > > >
> > > > >
> > > >
> > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > looks like overlayfs already should report the upper layer's i_version
> > > > in getattr, though I haven't tested that either:
> > > >
> > > > -----------------------8<---------------------------
> > > >
> > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > >
> > > > IMA currently accesses the i_version out of the inode directly when it
> > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > problematic with more complex setups (e.g. overlayfs).
> > > >
> > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > the filesystem to determine whether and how to report the i_version, and
> > > > should allow IMA to work properly with a broader class of filesystems in
> > > > the future.
> > > >
> > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > >
> > > So, I think we want both; we want the ovl_copyattr() and the
> > > vfs_getattr_nosec() change:
> > >
> > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > >     is in line what we do with all other inode attributes. IOW, the
> > >     overlayfs inode's i_version counter should aim to mirror the
> > >     relevant layer's i_version counter. I wouldn't know why that
> > >     shouldn't be the case. Asking the other way around there doesn't
> > >     seem to be any use for overlayfs inodes to have an i_version that
> > >     isn't just mirroring the relevant layer's i_version.
> >
> > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > inode.
> >
> > You can't just copy up the value from the upper. You'll need to call
> > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > logged i_version update on the next write. IOW, this could create some
> > (probably minor) metadata write amplification in the upper layer inode
> > with IS_I_VERSION inodes.
> >
> >
> > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > >     Currently, ima assumes that it will get the correct i_version from
> > >     an inode but that just doesn't hold for stacking filesystem.
> > >
> > > While (1) would likely just fix the immediate bug (2) is correct and
> > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > get updated and ima with it. Poking at raw inodes without using
> > > appropriate helpers is much more likely to get ima into trouble.
> >
> > This will fix it the right way, I think (assuming it actually works),
> > and should open the door for IMA to work properly with networked
> > filesystems that support i_version as well.
>
> On a local filesystem, there are guarantees that the calculated file
> hash is that of the file being used.  Reminder IMA reads a file, page
> size chunk at a time into a single buffer, calculating the file hash.
> Once the file hash is calculated, the memory is freed.
>
> There are no guarantees on a fuse filesystem, for example, that the
> original file read and verified is the same as the one being executed.
> I'm not sure that the integrity guarantees of a file on a remote
> filesystem will be the same as those on a local file system.
>
> >
> > Note that there Stephen is correct that calling getattr is probably
> > going to be less efficient here since we're going to end up calling
> > generic_fillattr unnecessarily, but I still think it's the right thing
> > to do.
> >
> > If it turns out to cause measurable performance regressions though,
> > maybe we can look at adding a something that still calls ->getattr if it
> > exists but only returns the change_cookie value.
>
> Sure.  For now,
>
> Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>

I'm going through my review queue to make sure I haven't missed
anything and this thread popped up ... Stefan, Mimi, did you get a fix
into an upstream tree somewhere?  If not, is it because you are
waiting on a review/merge from me into the LSM tree?

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-18 20:46                             ` Paul Moore
@ 2023-05-18 20:50                               ` Mimi Zohar
  2023-05-19 14:58                                 ` Paul Moore
  0 siblings, 1 reply; 61+ messages in thread
From: Mimi Zohar @ 2023-05-18 20:50 UTC (permalink / raw)
  To: Paul Moore, Stefan Berger
  Cc: Jeff Layton, Christian Brauner, Amir Goldstein, linux-integrity,
	miklos, linux-kernel, linux-security-module, linux-fsdevel,
	linux-unionfs

On Thu, 2023-05-18 at 16:46 -0400, Paul Moore wrote:
> On Fri, Apr 21, 2023 at 10:44 AM Mimi Zohar <zohar@linux.ibm.com> wrote:
> > On Fri, 2023-04-07 at 09:29 -0400, Jeff Layton wrote:
> > > > > > >
> > > > > > > I would ditch the original proposal in favor of this 2-line patch shown here:
> > > > > > >
> > > > > > > https://lore.kernel.org/linux-integrity/a95f62ed-8b8a-38e5-e468-ecbde3b221af@linux.ibm.com/T/#m3bd047c6e5c8200df1d273c0ad551c645dd43232
> > > >
> > > > We should cool it with the quick hacks to fix things. :)
> > > >
> > >
> > > Yeah. It might fix this specific testcase, but I think the way it uses
> > > the i_version is "gameable" in other situations. Then again, I don't
> > > know a lot about IMA in this regard.
> > >
> > > When is it expected to remeasure? If it's only expected to remeasure on
> > > a close(), then that's one thing. That would be a weird design though.
> >
> > Historical background:
> >
> > Prior to IMA being upstreamed there was a lot of discussion about how
> > much/how frequently to measure files.  Re-measuring files after each
> > write would impact performance.  Instead of re-measuring files after
> > each write, if a file already opened for write was opened for read
> > (open writers) or a file already opened for read was opened for write
> > (Time of Measure/Time of Use) the IMA measurement list was invalidated
> > by including a violation record in the measurement list.
> >
> > Only the BPRM hook prevents a file from being opened for write.
> >
> > >
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > Ok, I think I get it. IMA is trying to use the i_version from the
> > > > > > overlayfs inode.
> > > > > >
> > > > > > I suspect that the real problem here is that IMA is just doing a bare
> > > > > > inode_query_iversion. Really, we ought to make IMA call
> > > > > > vfs_getattr_nosec (or something like it) to query the getattr routine in
> > > > > > the upper layer. Then overlayfs could just propagate the results from
> > > > > > the upper layer in its response.
> > > > > >
> > > > > > That sort of design may also eventually help IMA work properly with more
> > > > > > exotic filesystems, like NFS or Ceph.
> > > > > >
> > > > > >
> > > > > >
> > > > >
> > > > > Maybe something like this? It builds for me but I haven't tested it. It
> > > > > looks like overlayfs already should report the upper layer's i_version
> > > > > in getattr, though I haven't tested that either:
> > > > >
> > > > > -----------------------8<---------------------------
> > > > >
> > > > > [PATCH] IMA: use vfs_getattr_nosec to get the i_version
> > > > >
> > > > > IMA currently accesses the i_version out of the inode directly when it
> > > > > does a measurement. This is fine for most simple filesystems, but can be
> > > > > problematic with more complex setups (e.g. overlayfs).
> > > > >
> > > > > Make IMA instead call vfs_getattr_nosec to get this info. This allows
> > > > > the filesystem to determine whether and how to report the i_version, and
> > > > > should allow IMA to work properly with a broader class of filesystems in
> > > > > the future.
> > > > >
> > > > > Reported-by: Stefan Berger <stefanb@linux.ibm.com>
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > >
> > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > vfs_getattr_nosec() change:
> > > >
> > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > >     is in line what we do with all other inode attributes. IOW, the
> > > >     overlayfs inode's i_version counter should aim to mirror the
> > > >     relevant layer's i_version counter. I wouldn't know why that
> > > >     shouldn't be the case. Asking the other way around there doesn't
> > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > >     isn't just mirroring the relevant layer's i_version.
> > >
> > > It's less than ideal to do this IMO, particularly with an IS_I_VERSION
> > > inode.
> > >
> > > You can't just copy up the value from the upper. You'll need to call
> > > inode_query_iversion(upper_inode), which will flag the upper inode for a
> > > logged i_version update on the next write. IOW, this could create some
> > > (probably minor) metadata write amplification in the upper layer inode
> > > with IS_I_VERSION inodes.
> > >
> > >
> > > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > > >     Currently, ima assumes that it will get the correct i_version from
> > > >     an inode but that just doesn't hold for stacking filesystem.
> > > >
> > > > While (1) would likely just fix the immediate bug (2) is correct and
> > > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > > get updated and ima with it. Poking at raw inodes without using
> > > > appropriate helpers is much more likely to get ima into trouble.
> > >
> > > This will fix it the right way, I think (assuming it actually works),
> > > and should open the door for IMA to work properly with networked
> > > filesystems that support i_version as well.
> >
> > On a local filesystem, there are guarantees that the calculated file
> > hash is that of the file being used.  Reminder IMA reads a file, page
> > size chunk at a time into a single buffer, calculating the file hash.
> > Once the file hash is calculated, the memory is freed.
> >
> > There are no guarantees on a fuse filesystem, for example, that the
> > original file read and verified is the same as the one being executed.
> > I'm not sure that the integrity guarantees of a file on a remote
> > filesystem will be the same as those on a local file system.
> >
> > >
> > > Note that there Stephen is correct that calling getattr is probably
> > > going to be less efficient here since we're going to end up calling
> > > generic_fillattr unnecessarily, but I still think it's the right thing
> > > to do.
> > >
> > > If it turns out to cause measurable performance regressions though,
> > > maybe we can look at adding a something that still calls ->getattr if it
> > > exists but only returns the change_cookie value.
> >
> > Sure.  For now,
> >
> > Reviewed-by: Mimi Zohar <zohar@linux.ibm.com>
> 
> I'm going through my review queue to make sure I haven't missed
> anything and this thread popped up ... Stefan, Mimi, did you get a fix
> into an upstream tree somewhere?  If not, is it because you are
> waiting on a review/merge from me into the LSM tree?

Sorry for the delay.  Between vacation and LSS, I just started testing
Jeff Layton's patch.

Mimi


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-18 20:50                               ` Mimi Zohar
@ 2023-05-19 14:58                                 ` Paul Moore
  2023-05-25 14:43                                   ` Mimi Zohar
  0 siblings, 1 reply; 61+ messages in thread
From: Paul Moore @ 2023-05-19 14:58 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: Stefan Berger, Jeff Layton, Christian Brauner, Amir Goldstein,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Thu, May 18, 2023 at 4:56 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
> On Thu, 2023-05-18 at 16:46 -0400, Paul Moore wrote:
> > On Fri, Apr 21, 2023 at 10:44 AM Mimi Zohar <zohar@linux.ibm.com> wrote:
> > > On Fri, 2023-04-07 at 09:29 -0400, Jeff Layton wrote:

...

> > I'm going through my review queue to make sure I haven't missed
> > anything and this thread popped up ... Stefan, Mimi, did you get a fix
> > into an upstream tree somewhere?  If not, is it because you are
> > waiting on a review/merge from me into the LSM tree?
>
> Sorry for the delay.  Between vacation and LSS, I just started testing
> Jeff Layton's patch.

No worries, I'm a bit behind too, I just wanted to make sure I wasn't
blocking this thread :)

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-04-07  8:31                       ` Christian Brauner
  2023-04-07 13:29                         ` Jeff Layton
@ 2023-05-19 19:42                         ` Mimi Zohar
  2023-05-20  9:15                           ` Amir Goldstein
  2023-05-20  9:17                           ` Christian Brauner
  1 sibling, 2 replies; 61+ messages in thread
From: Mimi Zohar @ 2023-05-19 19:42 UTC (permalink / raw)
  To: Christian Brauner, Amir Goldstein, Jeff Layton
  Cc: Stefan Berger, Paul Moore, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs,
	Ignaz Forster, Petr Vorel

On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> So, I think we want both; we want the ovl_copyattr() and the
> vfs_getattr_nosec() change:
> 
> (1) overlayfs should copy up the inode version in ovl_copyattr(). That
>     is in line what we do with all other inode attributes. IOW, the
>     overlayfs inode's i_version counter should aim to mirror the
>     relevant layer's i_version counter. I wouldn't know why that
>     shouldn't be the case. Asking the other way around there doesn't
>     seem to be any use for overlayfs inodes to have an i_version that
>     isn't just mirroring the relevant layer's i_version.
> (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
>     Currently, ima assumes that it will get the correct i_version from
>     an inode but that just doesn't hold for stacking filesystem.
> 
> While (1) would likely just fix the immediate bug (2) is correct and
> _robust_. If we change how attributes are handled vfs_*() helpers will
> get updated and ima with it. Poking at raw inodes without using
> appropriate helpers is much more likely to get ima into trouble.

In addition to properly setting the i_version for IMA, EVM has a
similar issue with i_generation and s_uuid. Adding them to
ovl_copyattr() seems to resolve it.   Does that make sense?

diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 923d66d131c1..cd0aeb828868 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
 	inode->i_atime = realinode->i_atime;
 	inode->i_mtime = realinode->i_mtime;
 	inode->i_ctime = realinode->i_ctime;
+	inode->i_generation = realinode->i_generation;
+	if (inode->i_sb)
+		uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb->s_uuid);
 	i_size_write(inode, i_size_read(realinode));
 }
-- 
thanks,

Mimi


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-19 19:42                         ` Mimi Zohar
@ 2023-05-20  9:15                           ` Amir Goldstein
  2023-05-22 12:18                             ` Mimi Zohar
  2023-05-20  9:17                           ` Christian Brauner
  1 sibling, 1 reply; 61+ messages in thread
From: Amir Goldstein @ 2023-05-20  9:15 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: Christian Brauner, Jeff Layton, Stefan Berger, Paul Moore,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs, Ignaz Forster, Petr Vorel

On Fri, May 19, 2023 at 10:42 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
>
> On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > So, I think we want both; we want the ovl_copyattr() and the
> > vfs_getattr_nosec() change:
> >
> > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> >     is in line what we do with all other inode attributes. IOW, the
> >     overlayfs inode's i_version counter should aim to mirror the
> >     relevant layer's i_version counter. I wouldn't know why that
> >     shouldn't be the case. Asking the other way around there doesn't
> >     seem to be any use for overlayfs inodes to have an i_version that
> >     isn't just mirroring the relevant layer's i_version.
> > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> >     Currently, ima assumes that it will get the correct i_version from
> >     an inode but that just doesn't hold for stacking filesystem.
> >
> > While (1) would likely just fix the immediate bug (2) is correct and
> > _robust_. If we change how attributes are handled vfs_*() helpers will
> > get updated and ima with it. Poking at raw inodes without using
> > appropriate helpers is much more likely to get ima into trouble.
>
> In addition to properly setting the i_version for IMA, EVM has a
> similar issue with i_generation and s_uuid. Adding them to
> ovl_copyattr() seems to resolve it.   Does that make sense?
>
> diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> index 923d66d131c1..cd0aeb828868 100644
> --- a/fs/overlayfs/util.c
> +++ b/fs/overlayfs/util.c
> @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
>         inode->i_atime = realinode->i_atime;
>         inode->i_mtime = realinode->i_mtime;
>         inode->i_ctime = realinode->i_ctime;
> +       inode->i_generation = realinode->i_generation;
> +       if (inode->i_sb)
> +               uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> >s_uuid);

That is not a possible solution Mimi.

The i_generation copy *may* be acceptable in "all layers on same fs"
setup, but changing overlayfs s_uuid over and over is a non-starter.

If you explain the problem, I may be able to help you find a better solution.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-19 19:42                         ` Mimi Zohar
  2023-05-20  9:15                           ` Amir Goldstein
@ 2023-05-20  9:17                           ` Christian Brauner
  2023-05-21 22:49                             ` Dave Chinner
  1 sibling, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-05-20  9:17 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: Amir Goldstein, Jeff Layton, Stefan Berger, Paul Moore,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs, Ignaz Forster, Petr Vorel

On Fri, May 19, 2023 at 03:42:38PM -0400, Mimi Zohar wrote:
> On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > So, I think we want both; we want the ovl_copyattr() and the
> > vfs_getattr_nosec() change:
> > 
> > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> >     is in line what we do with all other inode attributes. IOW, the
> >     overlayfs inode's i_version counter should aim to mirror the
> >     relevant layer's i_version counter. I wouldn't know why that
> >     shouldn't be the case. Asking the other way around there doesn't
> >     seem to be any use for overlayfs inodes to have an i_version that
> >     isn't just mirroring the relevant layer's i_version.
> > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> >     Currently, ima assumes that it will get the correct i_version from
> >     an inode but that just doesn't hold for stacking filesystem.
> > 
> > While (1) would likely just fix the immediate bug (2) is correct and
> > _robust_. If we change how attributes are handled vfs_*() helpers will
> > get updated and ima with it. Poking at raw inodes without using
> > appropriate helpers is much more likely to get ima into trouble.
> 
> In addition to properly setting the i_version for IMA, EVM has a
> similar issue with i_generation and s_uuid. Adding them to
> ovl_copyattr() seems to resolve it.   Does that make sense?
> 
> diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> index 923d66d131c1..cd0aeb828868 100644
> --- a/fs/overlayfs/util.c
> +++ b/fs/overlayfs/util.c
> @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
>  	inode->i_atime = realinode->i_atime;
>  	inode->i_mtime = realinode->i_mtime;
>  	inode->i_ctime = realinode->i_ctime;
> +	inode->i_generation = realinode->i_generation;
> +	if (inode->i_sb)
> +		uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-

Overlayfs can consist of multiple lower layers and each of those lower
layers may have a different uuid. So every time you trigger a
ovl_copyattr() on a different layer this patch would alter the uuid of
the overlayfs superblock.

In addition the uuid should be set when the filesystem is mounted.
Unless the filesystem implements a dedicated ioctl() - like ext4 - to
change the uuid.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-20  9:17                           ` Christian Brauner
@ 2023-05-21 22:49                             ` Dave Chinner
  2023-05-22 10:50                               ` uuid ioctl - was: " Christian Brauner
  2023-05-23 17:35                               ` Mimi Zohar
  0 siblings, 2 replies; 61+ messages in thread
From: Dave Chinner @ 2023-05-21 22:49 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Mimi Zohar, Amir Goldstein, Jeff Layton, Stefan Berger,
	Paul Moore, linux-integrity, miklos, linux-kernel,
	linux-security-module, linux-fsdevel, linux-unionfs,
	Ignaz Forster, Petr Vorel

On Sat, May 20, 2023 at 11:17:35AM +0200, Christian Brauner wrote:
> On Fri, May 19, 2023 at 03:42:38PM -0400, Mimi Zohar wrote:
> > On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > > So, I think we want both; we want the ovl_copyattr() and the
> > > vfs_getattr_nosec() change:
> > > 
> > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > >     is in line what we do with all other inode attributes. IOW, the
> > >     overlayfs inode's i_version counter should aim to mirror the
> > >     relevant layer's i_version counter. I wouldn't know why that
> > >     shouldn't be the case. Asking the other way around there doesn't
> > >     seem to be any use for overlayfs inodes to have an i_version that
> > >     isn't just mirroring the relevant layer's i_version.
> > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > >     Currently, ima assumes that it will get the correct i_version from
> > >     an inode but that just doesn't hold for stacking filesystem.
> > > 
> > > While (1) would likely just fix the immediate bug (2) is correct and
> > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > get updated and ima with it. Poking at raw inodes without using
> > > appropriate helpers is much more likely to get ima into trouble.
> > 
> > In addition to properly setting the i_version for IMA, EVM has a
> > similar issue with i_generation and s_uuid. Adding them to
> > ovl_copyattr() seems to resolve it.   Does that make sense?
> > 
> > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > index 923d66d131c1..cd0aeb828868 100644
> > --- a/fs/overlayfs/util.c
> > +++ b/fs/overlayfs/util.c
> > @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
> >  	inode->i_atime = realinode->i_atime;
> >  	inode->i_mtime = realinode->i_mtime;
> >  	inode->i_ctime = realinode->i_ctime;
> > +	inode->i_generation = realinode->i_generation;
> > +	if (inode->i_sb)
> > +		uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> 
> Overlayfs can consist of multiple lower layers and each of those lower
> layers may have a different uuid. So everytime you trigger a
> ovl_copyattr() on a different layer this patch would alter the uuid of
> the overlayfs superblock.
> 
> In addition the uuid should be set when the filesystem is mounted.
> Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> change the uuid.

IMO, that ext4 functionality is a landmine waiting to be stepped on.

We should not be changing the sb->s_uuid of filesystems dynamically.
The VFS does not guarantee in any way that it is safe to change the
sb->s_uuid (i.e. no locking, no change notifications, no udev
events, etc). Various subsystems - both in the kernel and in
userspace - use the sb->s_uuid as a canonical and/or persistent
filesystem/device identifier and are unprepared to have it change
while the filesystem is mounted and active.

I commented on this from an XFS perspective here when it was
proposed to copy this ext4 mis-feature in XFS:

https://lore.kernel.org/linux-xfs/20230314062847.GQ360264@dread.disaster.area/

Further to this, I also suspect that changing uuids online will
cause issues with userspace caching of fs uuids (e.g. libblkid and
anything that uses it) and information that uses uuids to identify
the filesystem that are set up at mount time (/dev/disk/by-uuid/
links, etc) by kernel events sent to userspace helpers...

IMO, we shouldn't even be considering dynamic sb->s_uuid changes
without first working through the full system impacts of having
persistent userspace-visible filesystem identifiers change
dynamically...

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-21 22:49                             ` Dave Chinner
@ 2023-05-22 10:50                               ` Christian Brauner
  2023-06-02  1:23                                 ` Darrick J. Wong
  2023-05-23 17:35                               ` Mimi Zohar
  1 sibling, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-05-22 10:50 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Amir Goldstein, Jeff Layton, miklos, linux-fsdevel, linux-xfs

On Mon, May 22, 2023 at 08:49:50AM +1000, Dave Chinner wrote:
> On Sat, May 20, 2023 at 11:17:35AM +0200, Christian Brauner wrote:
> > On Fri, May 19, 2023 at 03:42:38PM -0400, Mimi Zohar wrote:
> > > On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > vfs_getattr_nosec() change:
> > > > 
> > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > >     is in line what we do with all other inode attributes. IOW, the
> > > >     overlayfs inode's i_version counter should aim to mirror the
> > > >     relevant layer's i_version counter. I wouldn't know why that
> > > >     shouldn't be the case. Asking the other way around there doesn't
> > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > >     isn't just mirroring the relevant layer's i_version.
> > > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > > >     Currently, ima assumes that it will get the correct i_version from
> > > >     an inode but that just doesn't hold for stacking filesystem.
> > > > 
> > > > While (1) would likely just fix the immediate bug (2) is correct and
> > > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > > get updated and ima with it. Poking at raw inodes without using
> > > > appropriate helpers is much more likely to get ima into trouble.
> > > 
> > > In addition to properly setting the i_version for IMA, EVM has a
> > > similar issue with i_generation and s_uuid. Adding them to
> > > ovl_copyattr() seems to resolve it.   Does that make sense?
> > > 
> > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > index 923d66d131c1..cd0aeb828868 100644
> > > --- a/fs/overlayfs/util.c
> > > +++ b/fs/overlayfs/util.c
> > > @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
> > >  	inode->i_atime = realinode->i_atime;
> > >  	inode->i_mtime = realinode->i_mtime;
> > >  	inode->i_ctime = realinode->i_ctime;
> > > +	inode->i_generation = realinode->i_generation;
> > > +	if (inode->i_sb)
> > > +		uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> > 
> > Overlayfs can consist of multiple lower layers and each of those lower
> > layers may have a different uuid. So everytime you trigger a
> > ovl_copyattr() on a different layer this patch would alter the uuid of
> > the overlayfs superblock.
> > 
> > In addition the uuid should be set when the filesystem is mounted.
> > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > change the uuid.
> 
> IMO, that ext4 functionality is a landmine waiting to be stepped on.
> 
> We should not be changing the sb->s_uuid of filesystems dynamically.

Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
ioctl then this is fine though.

Thanks for bringing this up. I had some thoughts on this (mostly at the
end of this mail) but haven't had the time to express them.

> The VFS does not guarantee in any way that it is safe to change the
> sb->s_uuid (i.e. no locking, no change notifications, no udev
> events, etc). Various subsystems - both in the kernel and in
> userspace - use the sb->s_uuid as a canonical and/or persistent
> filesystem/device identifier and are unprepared to have it change
> while the filesystem is mounted and active.

Yes, it is not a VFS concept for sure.

> 
> I commented on this from an XFS perspective here when it was
> proposed to copy this ext4 mis-feature in XFS:
> 
> https://lore.kernel.org/linux-xfs/20230314062847.GQ360264@dread.disaster.area/

So I read the thread back then and I agree with you specifically about:

* changing uuid dynamically isn't a well-defined concept
* hoisting the ext4 specific ioctl that allows changing the uuid
  dynamically into a generic vfs ioctl is premature and gives the
  impression that this is a well-defined concept when it isn't.
* the chosen data structure with a flexible array member would probably
  work but is suboptimal

> 
> Further to this, I also suspect that changing uuids online will
> cause issues with userspace caching of fs uuids (e.g. libblkid and
> anything that uses it) and information that uses uuids to identify
> the filesystem that are set up at mount time (/dev/disk/by-uuid/
> links, etc) by kernel events sent to userspace helpers...

Yeah, that's a valid concern as it's common practice to put uuids into
/etc/fstab so if they were allowed to change while the filesystem is
mounted/superblock is active the minimum thing needed is for userspace
to get a uevent so the /dev/disk/by-uuid/$uuid symlink can be updated by
udev. But I digress.

> 
> IMO, we shouldn't even be considering dynamic sb->s_uuid changes
> without first working through the full system impacts of having
> persistent userspace-visible filesystem identifiers change
> dynamically...

Yes.

---

The thing that I think we could do is have all filesystems that can
reasonably support it set a uuid. We currently don't do that. If we
would start doing that then all filesystems that currently don't
implement a separate f_fsid based on e.g., the disk's device number can
just generate the f_fsid based on the uuid. This will make all these
filesystems available to be used with fanotify - which requires f_fsid
to be set for its most useful features.

This is often the most useful for filesystems such as tmpfs which gained
support for uuids quite recently. For such pseudo filesystems the
lifetime of the uuid would be the lifetime of the superblock in contrast
to filesystems like xfs that persist the uuid to disk. IOW, if you
mount -t tmpfs tmpfs /mnt; umount /mnt; mount -t tmpfs tmpfs /mnt then
you get a new uuid but the uuid stays fixed for the lifetime of the
superblock and can't be changed.

So the patchset that you objected to had one part that made sense to me
which was to hoist the ioctl that _gets_ the uuid from a filesystem
into a generic ioctl. But I agree that the structure wasn't chosen
nicely. I would prefer if this was a fixed size but extensible structure
which is a concept we've had for a long time. So say we were to choose
the following structure layout for the generic ioctl:

struct fsuuid {
        __u32       fsu_len;
        __u32       fsu_flags;
        __u8        fsu_uuid[16]; // 8 * 16 = 128 = 64 * 2
};

then this would be compatible with ext4. It would also be extensible if
we wanted to add additional fields in the future or switch to a new uuid
format or whatever.

A while back we did work for extensible struct in system calls but these
extensible structs also work with ioctls.

For example, see what we did for kernel/seccomp.c:

        /* Extensible Argument ioctls */
        #define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))

        switch (EA_IOCTL(cmd)) {
        case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
                return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
        default:
                return -EINVAL;
        }

and then

        static long seccomp_notify_addfd(struct seccomp_filter *filter,
                                         struct seccomp_notif_addfd __user *uaddfd,
                                         unsigned int size)
        {
                [...]

                BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
                BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);

                if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
                        return -EINVAL;

                ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
                if (ret)
                        return ret;

                [...]
        }

So the struct is versioned by size the same as for system calls. The
difference for the ioctl is that the size is already encoded in the
ioctl when it is defined. So even with a fixed size struct it is
trivially possible to extend the struct later as long as the extension
is 64bit aligned.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-20  9:15                           ` Amir Goldstein
@ 2023-05-22 12:18                             ` Mimi Zohar
  2023-05-22 14:00                               ` Amir Goldstein
  0 siblings, 1 reply; 61+ messages in thread
From: Mimi Zohar @ 2023-05-22 12:18 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christian Brauner, Jeff Layton, Stefan Berger, Paul Moore,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs, Ignaz Forster, Petr Vorel

On Sat, 2023-05-20 at 12:15 +0300, Amir Goldstein wrote:
> On Fri, May 19, 2023 at 10:42 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
> >
> > On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > > So, I think we want both; we want the ovl_copyattr() and the
> > > vfs_getattr_nosec() change:
> > >
> > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > >     is in line what we do with all other inode attributes. IOW, the
> > >     overlayfs inode's i_version counter should aim to mirror the
> > >     relevant layer's i_version counter. I wouldn't know why that
> > >     shouldn't be the case. Asking the other way around there doesn't
> > >     seem to be any use for overlayfs inodes to have an i_version that
> > >     isn't just mirroring the relevant layer's i_version.
> > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > >     Currently, ima assumes that it will get the correct i_version from
> > >     an inode but that just doesn't hold for stacking filesystem.
> > >
> > > While (1) would likely just fix the immediate bug (2) is correct and
> > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > get updated and ima with it. Poking at raw inodes without using
> > > appropriate helpers is much more likely to get ima into trouble.
> >
> > In addition to properly setting the i_version for IMA, EVM has a
> > similar issue with i_generation and s_uuid. Adding them to
> > ovl_copyattr() seems to resolve it.   Does that make sense?
> >
> > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > index 923d66d131c1..cd0aeb828868 100644
> > --- a/fs/overlayfs/util.c
> > +++ b/fs/overlayfs/util.c
> > @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
> >         inode->i_atime = realinode->i_atime;
> >         inode->i_mtime = realinode->i_mtime;
> >         inode->i_ctime = realinode->i_ctime;
> > +       inode->i_generation = realinode->i_generation;
> > +       if (inode->i_sb)
> > +               uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> > >s_uuid);
> 
> That is not a possible solution Mimi.
> 
> The i_generation copy *may* be acceptable in "all layers on same fs"
> setup, but changing overlayfs s_uuid over and over is a non-starter.
> 
> If you explain the problem, I may be able to help you find a better solution.

EVM calculates an HMAC of the file metadata (security xattrs, i_ino,
i_generation, i_uid, i_gid, i_mode, s_uuid)  and stores it as
security.evm.  Normally this would be used for mutable files, which
cannot be signed.  The i_generation and s_uuid on the lower layer and
the overlay are not the same, causing the EVM HMAC verification to
fail.

-- 
thanks,

Mimi


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-22 12:18                             ` Mimi Zohar
@ 2023-05-22 14:00                               ` Amir Goldstein
  2023-05-23 19:38                                 ` Mimi Zohar
  0 siblings, 1 reply; 61+ messages in thread
From: Amir Goldstein @ 2023-05-22 14:00 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: Christian Brauner, Jeff Layton, Stefan Berger, Paul Moore,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs, Ignaz Forster, Petr Vorel

On Mon, May 22, 2023 at 3:18 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
>
> On Sat, 2023-05-20 at 12:15 +0300, Amir Goldstein wrote:
> > On Fri, May 19, 2023 at 10:42 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
> > >
> > > On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > vfs_getattr_nosec() change:
> > > >
> > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > >     is in line what we do with all other inode attributes. IOW, the
> > > >     overlayfs inode's i_version counter should aim to mirror the
> > > >     relevant layer's i_version counter. I wouldn't know why that
> > > >     shouldn't be the case. Asking the other way around there doesn't
> > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > >     isn't just mirroring the relevant layer's i_version.
> > > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > > >     Currently, ima assumes that it will get the correct i_version from
> > > >     an inode but that just doesn't hold for stacking filesystem.
> > > >
> > > > While (1) would likely just fix the immediate bug (2) is correct and
> > > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > > get updated and ima with it. Poking at raw inodes without using
> > > > appropriate helpers is much more likely to get ima into trouble.
> > >
> > > In addition to properly setting the i_version for IMA, EVM has a
> > > similar issue with i_generation and s_uuid. Adding them to
> > > ovl_copyattr() seems to resolve it.   Does that make sense?
> > >
> > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > index 923d66d131c1..cd0aeb828868 100644
> > > --- a/fs/overlayfs/util.c
> > > +++ b/fs/overlayfs/util.c
> > > @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
> > >         inode->i_atime = realinode->i_atime;
> > >         inode->i_mtime = realinode->i_mtime;
> > >         inode->i_ctime = realinode->i_ctime;
> > > +       inode->i_generation = realinode->i_generation;
> > > +       if (inode->i_sb)
> > > +               uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> > > >s_uuid);
> >
> > That is not a possible solution Mimi.
> >
> > The i_generation copy *may* be acceptable in "all layers on same fs"
> > setup, but changing overlayfs s_uuid over and over is a non-starter.
> >
> > If you explain the problem, I may be able to help you find a better solution.
>
> EVM calculates an HMAC of the file metadata (security xattrs, i_ino,
> i_generation, i_uid, i_gid, i_mode, s_uuid)  and stores it as
> security.evm.  Normally this would be used for mutable files, which
> cannot be signed.  The i_generation and s_uuid on the lower layer and
> the overlay are not the same, causing the EVM HMAC verification to
> fail.
>

OK, so EVM expects i_ino, i_generation, i_uid, i_gid, i_mode, s_uuid
and security xattr to remain stable and persistent (survive umount/mount).
Correct?

You cannot expect that the same EVM xattr will correctly describe both
the overlayfs inode and the underlying real fs inode, because they may
vary in some of the metadata, so you need to decide if you only want to attest
overlayfs inodes, real underlying inodes or both.
If both, then the same EVM xattr cannot be used, but as it is, overlayfs
inode has no "private" xattr version, it stores its xattr on the underlying
real inode.

i_uid, i_gid, i_mode:
Should be stable and persistent for overlayfs inode and survive copy up.
Should be identical to the underlying inode.

security xattr:
Overlayfs tries to copy up all security.* xattr and also calls the LSM
hook security_inode_copy_up_xattr() to approve each copied xattr.
Should be identical to the underlying inode.

s_uuid:
So far, overlayfs sb has a null uuid.
With this patch, overlayfs will gain a persistent s_uuid, just like any
other disk fs with the opt-in feature index=on:
https://lore.kernel.org/linux-unionfs/20230425132223.2608226-4-amir73il@gmail.com/
Should be different from the underlying fs uuid when there is more
than one underlying fs.
We can consider inheriting s_uuid from underlying fs when all layers
are on the same fs.

i_ino:
As documented in:
https://github.com/torvalds/linux/blob/master/Documentation/filesystems/overlayfs.rst#inode-properties
It should be persistent and survive copy up with the
xino=auto feature (module param or mount option) or
CONFIG_OVERLAY_FS_XINO_AUTO=y
which is not the kernel default, but already set by some distros.
Will be identical to the underlying inode only in some special cases
such as pure upper (not copied up) inodes.
Will be different from the underlying lower file inode in many other cases.

i_generation:
For xino=auto, we could follow the same rules as i_ino and get similar
qualities -
i_generation will become persistent and survive copy up, but it will not be
identical to the real underlying inode i_generation in many cases.

Bottom line:
If you only want to attest overlayfs inodes - shouldn't be too hard
If you want to attest both overlayfs inodes AND their backing "real" inodes -
much more challenging.

Hope that this writeup helps more than it confuses.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-21 22:49                             ` Dave Chinner
  2023-05-22 10:50                               ` uuid ioctl - was: " Christian Brauner
@ 2023-05-23 17:35                               ` Mimi Zohar
  1 sibling, 0 replies; 61+ messages in thread
From: Mimi Zohar @ 2023-05-23 17:35 UTC (permalink / raw)
  To: Dave Chinner, Christian Brauner
  Cc: Amir Goldstein, Jeff Layton, Stefan Berger, Paul Moore,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs, Ignaz Forster, Petr Vorel

On Mon, 2023-05-22 at 08:49 +1000, Dave Chinner wrote:


> > In addition the uuid should be set when the filesystem is mounted.
> > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > change the uuid.
> 
> IMO, that ext4 functionality is a landmine waiting to be stepped on.
> 
> We should not be changing the sb->s_uuid of filesystems dynamically.
> The VFS does not guarantee in any way that it is safe to change the
> sb->s_uuid (i.e. no locking, no change notifications, no udev
> events, etc). Various subsystems - both in the kernel and in
> userspace - use the sb->s_uuid as a canonical and/or persistent
> filesystem/device identifier and are unprepared to have it change
> while the filesystem is mounted and active.
> 
> I commented on this from an XFS perspective here when it was
> proposed to copy this ext4 mis-feature in XFS:
> 
> https://lore.kernel.org/linux-xfs/20230314062847.GQ360264@dread.disaster.area/
> 
> Further to this, I also suspect that changing uuids online will
> cause issues with userspace caching of fs uuids (e.g. libblkid and
> anything that uses it) and information that uses uuids to identify
> the filesystem that are set up at mount time (/dev/disk/by-uuid/
> links, etc) by kernel events sent to userspace helpers...
> 
> IMO, we shouldn't even be considering dynamic sb->s_uuid changes
> without first working through the full system impacts of having
> persistent userspace-visible filesystem identifiers change
> dynamically...

Oh!   FYI, we've started using the ability to change the UUID for IMA
testing.  IMA policy rules can be defined in terms of the UUID without
impacting the existing policy rules.  Changing the UUID can be used to
enable different tests without interfering with existing policy rules.

Mimi


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-22 14:00                               ` Amir Goldstein
@ 2023-05-23 19:38                                 ` Mimi Zohar
  0 siblings, 0 replies; 61+ messages in thread
From: Mimi Zohar @ 2023-05-23 19:38 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Christian Brauner, Jeff Layton, Stefan Berger, Paul Moore,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs, Ignaz Forster, Petr Vorel

On Mon, 2023-05-22 at 17:00 +0300, Amir Goldstein wrote:
> On Mon, May 22, 2023 at 3:18 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
> >
> > On Sat, 2023-05-20 at 12:15 +0300, Amir Goldstein wrote:
> > > On Fri, May 19, 2023 at 10:42 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
> > > >
> > > > On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > > vfs_getattr_nosec() change:
> > > > >
> > > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > > >     is in line what we do with all other inode attributes. IOW, the
> > > > >     overlayfs inode's i_version counter should aim to mirror the
> > > > >     relevant layer's i_version counter. I wouldn't know why that
> > > > >     shouldn't be the case. Asking the other way around there doesn't
> > > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > > >     isn't just mirroring the relevant layer's i_version.
> > > > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > > > >     Currently, ima assumes that it will get the correct i_version from
> > > > >     an inode but that just doesn't hold for stacking filesystem.
> > > > >
> > > > > While (1) would likely just fix the immediate bug (2) is correct and
> > > > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > > > get updated and ima with it. Poking at raw inodes without using
> > > > > appropriate helpers is much more likely to get ima into trouble.
> > > >
> > > > In addition to properly setting the i_version for IMA, EVM has a
> > > > similar issue with i_generation and s_uuid. Adding them to
> > > > ovl_copyattr() seems to resolve it.   Does that make sense?
> > > >
> > > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > > index 923d66d131c1..cd0aeb828868 100644
> > > > --- a/fs/overlayfs/util.c
> > > > +++ b/fs/overlayfs/util.c
> > > > @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
> > > >         inode->i_atime = realinode->i_atime;
> > > >         inode->i_mtime = realinode->i_mtime;
> > > >         inode->i_ctime = realinode->i_ctime;
> > > > +       inode->i_generation = realinode->i_generation;
> > > > +       if (inode->i_sb)
> > > > +               uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> > > > >s_uuid);
> > >
> > > That is not a possible solution Mimi.
> > >
> > > The i_gneration copy *may* be acceptable in "all layers on same fs"
> > > setup, but changing overlayfs s_uuid over and over is a non-starter.
> > >
> > > If you explain the problem, I may be able to help you find a better solution.
> >
> > EVM calculates an HMAC of the file metadata (security xattrs, i_ino,
> > i_generation, i_uid, i_gid, i_mode, s_uuid)  and stores it as
> > security.evm.  Notrmally this would be used for mutable files, which
> > cannot be signed.  The i_generation and s_uuid on the lower layer and
> > the overlay are not the same, causing the EVM HMAC verification to
> > fail.
> >
> 
> OK, so EVM expects i_ino, i_generation, i_uid, i_gid, i_mode, s_uuid
> and security xattr to remain stable and persistent (survive umount/mount).
> Correct?

Yes

> 
> You cannot expect that the same EVM xattr will correctly describe both
> the overlayfs inode and the underlying real fs inode, because they may
> vary in some of the metadata, so need to decide if you only want to attest
> overlayfs inodes, real underlying inodes or both.

Understood.  Accessing a file on the overlay filesystem then needs to
be verified based on the backing file metadata.  Currently that isn't
being done.  So either all the backing file metadata needs to be copied
up or some other change(s) need to be made.

> If both, then the same EVM xattr cannot be used, but as it is, overlayfs
> inode has no "private" xattr version, it stores its xattr on the underlying
> real inode.
> 
> i_uid, i_gid, i_mode:
> Should be stable and persistent for overlayfs inode and survive copy up.
> Should be identical to the underlying inode.
> 
> security xattr:
> Overlayfs tries to copy up all security.* xattr and also calls the LSM
> hook security_inode_copy_up_xattr() to approve each copied xattr.
> Should be identical to the underlying inode.

> s_uuid:
> So far, overlayfs sb has a null uuid.
> With this patch, overlayfs will gain a persistent s_uuid, just like any
> other disk fs with the opt-in feature index=on:
> https://lore.kernel.org/linux-unionfs/20230425132223.2608226-4-amir73il@gmail.com/
> Should be different from the underlying fs uuid when there is more
> than one underlying fs.
> We can consider inheriting s_uuid from underlying fs when all layers
> are on the same fs.
> 
> i_ino:
> As documented in:
> https://github.com/torvalds/linux/blob/master/Documentation/filesystems/overlayfs.rst#inode-properties
> It should be persistent and survive copy up with the
> xino=auto feature (module param or mount option) or
> CONFIG_OVERLAY_FS_XINO_AUTO=y
> which is not the kernel default, but already set by some distros.
> Will be identical to the underlying inode only in some special cases
> such as pure upper (not copied up) inodes.
> Will be different from the underlying lower file inode many in other cases.
> 
> i_generation:
> For xino=auto, we could follow the same rules as i_ino and get similar
> qualities -
> i_generation will become persistent and survive copy up, but it will not be
> identical to the real underlying inode i_generation in many cases.
> 
> Bottom line:
> If you only want to attest overlayfs inodes - shouldn't be too hard
> If you want to attest both overlayfs inodes AND their backing "real" inodes -
> much more challenging.
> 
> Hope that this writeup helps more than it confuses.

Thanks, Amir.   It definitely helps.

To summarize what I'm seeing (IMA hash and EVM HMAC):

- Directly accessing overlay files, "lower" backed file, fails to
verify without copying all the file metadata up.

- Writing directly to the "upper" backing file properly updates the
file metadata.

- Writing directly to the overlay file does not write security.ima
either to the overlayfs  or the "upper" backing file.

policy rules:
appraise func=FILE_CHECK fsuuid=....
measure func=FILE_CHECK fsuuid=....
appraise func=FILE_CHECK fsname=overlay 
measure func=FILE_CHECK fsname=overlay

Mimi


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-19 14:58                                 ` Paul Moore
@ 2023-05-25 14:43                                   ` Mimi Zohar
  0 siblings, 0 replies; 61+ messages in thread
From: Mimi Zohar @ 2023-05-25 14:43 UTC (permalink / raw)
  To: Paul Moore
  Cc: Stefan Berger, Jeff Layton, Christian Brauner, Amir Goldstein,
	linux-integrity, miklos, linux-kernel, linux-security-module,
	linux-fsdevel, linux-unionfs

On Fri, 2023-05-19 at 10:58 -0400, Paul Moore wrote:
> On Thu, May 18, 2023 at 4:56 PM Mimi Zohar <zohar@linux.ibm.com> wrote:
> > On Thu, 2023-05-18 at 16:46 -0400, Paul Moore wrote:
> > > On Fri, Apr 21, 2023 at 10:44 AM Mimi Zohar <zohar@linux.ibm.com> wrote:
> > > > On Fri, 2023-04-07 at 09:29 -0400, Jeff Layton wrote:
> 
> ...
> 
> > > I'm going through my review queue to make sure I haven't missed
> > > anything and this thread popped up ... Stefan, Mimi, did you get a fix
> > > into an upstream tree somewhere?  If not, is it because you are
> > > waiting on a review/merge from me into the LSM tree?
> >
> > Sorry for the delay.  Between vacation and LSS, I just started testing
> > Jeff Layton's patch.
> 
> No worries, I'm a bit behind too, I just wanted to make sure I wasn't
> blocking this thread :)

FYI, Jeff Layton's patch is now queued in next-integrity.

-- 
thanks,

Mimi


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-05-22 10:50                               ` uuid ioctl - was: " Christian Brauner
@ 2023-06-02  1:23                                 ` Darrick J. Wong
  2023-06-02  4:27                                   ` Theodore Ts'o
  2023-06-02 13:14                                   ` Christian Brauner
  0 siblings, 2 replies; 61+ messages in thread
From: Darrick J. Wong @ 2023-06-02  1:23 UTC (permalink / raw)
  To: Christian Brauner, Theodore Ts'o
  Cc: Dave Chinner, Amir Goldstein, Jeff Layton, miklos, linux-fsdevel,
	linux-xfs

Someone ought to cc Ted since I asked him about this topic this morning
and he said he hadn't noticed it going by...

On Mon, May 22, 2023 at 12:50:19PM +0200, Christian Brauner wrote:
> On Mon, May 22, 2023 at 08:49:50AM +1000, Dave Chinner wrote:
> > On Sat, May 20, 2023 at 11:17:35AM +0200, Christian Brauner wrote:
> > > On Fri, May 19, 2023 at 03:42:38PM -0400, Mimi Zohar wrote:
> > > > On Fri, 2023-04-07 at 10:31 +0200, Christian Brauner wrote:
> > > > > So, I think we want both; we want the ovl_copyattr() and the
> > > > > vfs_getattr_nosec() change:
> > > > > 
> > > > > (1) overlayfs should copy up the inode version in ovl_copyattr(). That
> > > > >     is in line what we do with all other inode attributes. IOW, the
> > > > >     overlayfs inode's i_version counter should aim to mirror the
> > > > >     relevant layer's i_version counter. I wouldn't know why that
> > > > >     shouldn't be the case. Asking the other way around there doesn't
> > > > >     seem to be any use for overlayfs inodes to have an i_version that
> > > > >     isn't just mirroring the relevant layer's i_version.
> > > > > (2) Jeff's changes for ima to make it rely on vfs_getattr_nosec().
> > > > >     Currently, ima assumes that it will get the correct i_version from
> > > > >     an inode but that just doesn't hold for stacking filesystem.
> > > > > 
> > > > > While (1) would likely just fix the immediate bug (2) is correct and
> > > > > _robust_. If we change how attributes are handled vfs_*() helpers will
> > > > > get updated and ima with it. Poking at raw inodes without using
> > > > > appropriate helpers is much more likely to get ima into trouble.
> > > > 
> > > > In addition to properly setting the i_version for IMA, EVM has a
> > > > similar issue with i_generation and s_uuid. Adding them to
> > > > ovl_copyattr() seems to resolve it.   Does that make sense?
> > > > 
> > > > diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
> > > > index 923d66d131c1..cd0aeb828868 100644
> > > > --- a/fs/overlayfs/util.c
> > > > +++ b/fs/overlayfs/util.c
> > > > @@ -1118,5 +1118,8 @@ void ovl_copyattr(struct inode *inode)
> > > >  	inode->i_atime = realinode->i_atime;
> > > >  	inode->i_mtime = realinode->i_mtime;
> > > >  	inode->i_ctime = realinode->i_ctime;
> > > > +	inode->i_generation = realinode->i_generation;
> > > > +	if (inode->i_sb)
> > > > +		uuid_copy(&inode->i_sb->s_uuid, &realinode->i_sb-
> > > 
> > > Overlayfs can consist of multiple lower layers and each of those lower
> > > layers may have a different uuid. So everytime you trigger a
> > > ovl_copyattr() on a different layer this patch would alter the uuid of
> > > the overlayfs superblock.
> > > 
> > > In addition the uuid should be set when the filesystem is mounted.
> > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > change the uuid.
> > 
> > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > 
> > We should not be changing the sb->s_uuid of filesysetms dynamically.
> 
> Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> ioctl then this is fine though.

Now that Dave's brought up all kinds of questions about other parts of
the kernel using s_uuid for things, I'm starting to think that even ext4
shouldn't be changing its own uuid on the fly.

Unless, of course, someone writes a way to record when s_uuid has been
accessed by something (e.g. pnfs, ima, etc) to lock out changes to the
value...

> Thanks for bringing this up. I had some thoughts on this (mostly at the
> end of this mail) but haven't had the time to express them.
> 
> > The VFS does not guarantee in any way that it is safe to change the
> > sb->s_uuid (i.e. no locking, no change notifications, no udev
> > events, etc). Various subsystems - both in the kernel and in
> > userspace - use the sb->s_uuid as a canonical and/or persistent
> > filesystem/device identifier and are unprepared to have it change
> > while the filesystem is mounted and active.

...just like Dave just said.  Heh. :(

--D

> Yes, it is not a VFS concept for sure.
> 
> > 
> > I commented on this from an XFS perspective here when it was
> > proposed to copy this ext4 mis-feature in XFS:
> > 
> > https://lore.kernel.org/linux-xfs/20230314062847.GQ360264@dread.disaster.area/
> 
> So I read the thread back then and I agree with you specifically about:
> 
> * changing uuid dynamically isn't a well-defined concept
> * hoisting the ext4 specific ioctl that allows changing the uuid
>   dynamically into a generic vfs ioctl is premature and gives the
>   impression that this is a well-defined concept when it isn't.
> * the chosen data structure with a flexible array member would probably
>   work but is suboptimal
> 
> > 
> > Further to this, I also suspect that changing uuids online will
> > cause issues with userspace caching of fs uuids (e.g. libblkid and
> > anything that uses it) and information that uses uuids to identify
> > the filesystem that are set up at mount time (/dev/disk/by-uuid/
> > links, etc) by kernel events sent to userspace helpers...
> 
> Yeah, that's a valid concern as it's common practice to put uuids into
> /etc/fstab so if they were allowed to change while the filesystem is
> mounted/superblock is active the minimum thing needed is for userspace
> get a uevent so the /dev/disk/by-uuid/$uuid symlink can be updated by
> udev. But I digress.
> 
> > 
> > IMO, we shouldn't even be considering dynamic sb->s_uuid changes
> > without first working through the full system impacts of having
> > persistent userspace-visible filesystem identifiers change
> > dynamically...
> 
> Yes.
> 
> ---
> 
> The thing that I think we could do is have all filesystems that can
> reasonably support it set a uuid. We currently don't do that. If we
> would start doing that then all filesystems that currently don't
> implement a separate f_fsid based on e.g., the disk's device number can
> just generate the f_fsid based on the uuid. This will make all these
> filesystems available to be used with fanotify - which requires f_fsid
> to be set for its most useful features.
> 
> This is often the most useful for filesystems such as tmpfs which gained
> support for uuids quite recently. For such pseudo filesystems the
> lifetime of the uuid would be the lifetime of the superblock in contrast
> to filesystems like xfs that persist the uuid to disk. IOW, if you
> mount -t tmpfs tmpfs /mnt; umount /mnt; mount -t tmpfs tmpfs /mnt then
> you get a new uuid but the uuid stays fixed for the lifetime of the
> superblock and can't be changed.
> 
> So the patchset that you objected had one part that made sense to me
> which was to hoist the ioctl that _gets_ the uuid from a filesystems
> into a generic ioctl. But I agree that the structure wasn't chosen
> nicely. I would prefer if this was a fixed size but extensible structure
> which is a concept we've had for a long time. So say we were to chose
> the following structure layout for the generic ioctl:
> 
> struct fsuuid {
>         __u32       fsu_len;
>         __u32       fsu_flags;
>         __u8        fsu_uuid[16]; // 8 * 16 = 128 = 64 * 2
> };
> 
> then this would be compatible with ext4. It would also be extensible if
> we wanted to add additional fields in the future or switch to a new uuid
> format or whatever.
> 
> A while back we did work for extensible struct in system calls but these
> extensible structs also work with ioctls.
> 
> For example, see what we did for kernel/seccomp.c:
> 
>         /* Extensible Argument ioctls */
>         #define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
> 
>         switch (EA_IOCTL(cmd)) {
>         case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
>                 return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
>         default:
>                 return -EINVAL;
>         }
> 
> and then
> 
>         static long seccomp_notify_addfd(struct seccomp_filter *filter,
>                                          struct seccomp_notif_addfd __user *uaddfd,
>                                          unsigned int size)
>         {
>                 [...]
>         
>                 BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
>                 BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
>         
>                 if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
>                         return -EINVAL;
>         
>                 ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
>                 if (ret)
>                         return ret;
>         
>                 [...]
>         }
> 
> So the struct is versioned by size the same as for system calls. The
> difference for the ioctl is that the size is already encoded in the
> ioctl when it is defined. So even with a fixed size struct it is
> trivially possible to extend the struct later as long as the extension
> is 64bit aligned.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02  1:23                                 ` Darrick J. Wong
@ 2023-06-02  4:27                                   ` Theodore Ts'o
  2023-06-02  6:34                                     ` Dave Chinner
  2023-06-02 13:14                                   ` Christian Brauner
  1 sibling, 1 reply; 61+ messages in thread
From: Theodore Ts'o @ 2023-06-02  4:27 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christian Brauner, Dave Chinner, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> Someone ought to cc Ted since I asked him about this topic this morning
> and he said he hadn't noticed it going by...
> 
> > > > In addition the uuid should be set when the filesystem is mounted.
> > > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > > change the uuid.
> > > 
> > > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > > 
> > > We should not be changing the sb->s_uuid of filesysetms dynamically.
> > 
> > Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> > ioctl then this is fine though.
> 
> Now that Dave's brought up all kinds of questions about other parts of
> the kernel using s_uuid for things, I'm starting to think that even ext4
> shouldn't be changing its own uuid on the fly.

So let's set some context here.  The tune2fs program in e2fsprogs has
supported changing the UUID for a *very* long time.  Specifically,
since September 7, 1996 (e2fsprogs version 1.05, when we first added
the UUID field in the ext2 superblock).  This feature was added from
the very beginning since in Large Installation System Administration
(LISA) systems, a very common thing to do is to image boot disks from
a "golden master", and then afterwards, you want to make sure the file
systems on each boot disk have a unique UUID; and this is done via
"tune2fs -U random /dev/sdXX".  Since I was working at MIT Project
Athena at the time, we regularly did this when installing Athena
client workstations, and when I added UUID support to ext2, I made
sure this feature was well-supported.

The tune2fs program allows the UUID to be changed while the file system
is mounted (with some caveats), which it did by directly modifying the
on-disk superblock.  Obviously, when it did that, it wouldn't change
sb->s_uuid "dynamically", although the next time the file system was
mounted, sb->s_uuid would get the new UUID.  If overlayfs and IMA are
expecting that a file system's UUID would stay constant and persistent
--- well, that's not true, and it has always been that way, since
there are tools that make it trivially easy for a system administrator
to adjust the UUID.

In addition to the LISA context, this feature is also commonly used in
various cloud deployments, since when you create a new VM, it
typically gets a new root file system, which is copied from a fixed,
read-only image.  So on a particular hyperscale cloud system, if we
didn't do anything special, there could be hundreds of thousands VM's
whose root file system would all have the same UUID, which would mean
that the UUID... isn't terribly unique.

There are many problems that can result, but for example, if the user
or SRE were to take a cloud-level block device snapshot of a
malfunctioning VM, and then attach that snapshot on another VM, it is
quite possible that there might be two file systems mounted on a
particular VM that both have the same UUID ---- one for the "real"
root file system, and the other for the "bad" root file system that is
being examined.  Attempts to do mounts or umounts by UUID will then
result in hilarity.  (Not to mention potentially confusing support
personnel who might be looking at a metadata-only dump of the file
system.)

And so a common practice is for some cloud agents or init scripts to
change the root file system's UUID to a new random value when the VM
is first initially booted.  Yes, this can potentially cause problems
if the UUID is in /etc/fstab, but these scripts will typically update
/etc/fstab and make other userspace adjustments while they are at it.

In the case of Cloud Optimized OS, the change of the UUID via "tune2fs
-U random /dev/sdaX" was done in one systemd unit file, while another systemd
unit file would try to resize the partition to fill the size of the
root file system (since the VM can be created with the root disk
larger than the minimum size required by the cloud image).  These two
unit files can run at the same time, and so there was a very small
probability that userspace directly changing the superblock could race
with the file system resize operation, such that one or the other
operation would fail due to a bad superblock checksum error.

This is the reason why we added the ext4 ioctl; it was intended for
the express use of "tune2fs -U", and like tune2fs -U, it doesn't
actually change sb->s_uuid; it only changes the on-disk superblock's
UUID.  This was mostly because we forgot about sb->s_uuid, to be
honest, but it means that regardless of whether "tune2fs -U" directly
modifies the block device, or uses the ext4 ioctl, the behaviour with
respect to sb->s_uuid is the same; it's not modified when the on-disk
uuid is changed.

> > > The VFS does not guarantee in any way that it is safe to change the
> > > sb->s_uuid (i.e. no locking, no change notifications, no udev
> > > events, etc). Various subsystems - both in the kernel and in
> > > userspace - use the sb->s_uuid as a canonical and/or persistent
> > > filesystem/device identifier and are unprepared to have it change
> > > while the filesystem is mounted and active.

Note that the last sentence is a bit ambiguous.  There is the question
of whether sb->s_uuid won't change while the file system is mounted,
and then there is the question of whether s_uuid is **persistent**
---- which is to say, that it won't change across mounts or reboots.

If there are subsystems like IMA, overlayfs, pnfs, et al., which expect
that, I'm sorry, but sysadmin tools that make it trivially easy to
change the file system UUID long predate these other subsystems, and
there *are* system administrators --- particularly in the LISA or
Cloud context --- which have used "tune2fs -U" for good and proper
reasons.

> ...just like Dave just said.  Heh. :(

Heh, indeed.  :-/

					- Ted

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02  4:27                                   ` Theodore Ts'o
@ 2023-06-02  6:34                                     ` Dave Chinner
  2023-06-02 10:53                                       ` Amir Goldstein
                                                         ` (2 more replies)
  0 siblings, 3 replies; 61+ messages in thread
From: Dave Chinner @ 2023-06-02  6:34 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Darrick J. Wong, Christian Brauner, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > Someone ought to cc Ted since I asked him about this topic this morning
> > and he said he hadn't noticed it going by...
> > 
> > > > > In addition the uuid should be set when the filesystem is mounted.
> > > > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > > > change the uuid.
> > > > 
> > > > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > > > 
> > > > We should not be changing the sb->s_uuid of filesysetms dynamically.
> > > 
> > > Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> > > ioctl then this is fine though.
> > 
> > Now that Dave's brought up all kinds of questions about other parts of
> > the kernel using s_uuid for things, I'm starting to think that even ext4
> > shouldn't be changing its own uuid on the fly.
> 
> So let's set some context here.  The tune2fs program in e2fsprogs has
> supported changing the UUID for a *very* long time.  Specifically,
> since September 7, 1996 (e2fsprogs version 1.05, when we first added
> the UUID field in the ext2 superblock).

Yup, and XFS has supported offline changing of the UUID a couple of
years before that.

> This feature was added from
> the very beginning since in Large Installation System Administration
> (LISA) systems, a very common thing to do is to image boot disks from
> a "golden master", and then afterwards, you want to make sure the file
> systems on each boot disk have a unique UUID; and this is done via
> "tune2fs -U random /dev/sdXX".  Since I was working at MIT Project
> Athena at the time, we regularly did this when installing Athena
> client workstations, and when I added UUID support to ext2, I made
> sure this feature was well-supported.

See xfs_copy(8). This was a tool originally written, IIRC, in early
1995 for physically cloning sparse golden images in the SGI factory
production line. It was multi-threaded and could write up to 16 scsi
disks at once with a single ascending LBA order pass. The last thing
it does is change the UUID of each clone to make them unique.

There's nothing new here - this is all 30 years ago, and we've had
tools changing filesystems UUIDs for all this time.

> The tune2fs program allows the UUID to be changed via the file system
> is mounted (with some caveats), which it did by directly modifying the
> on-disk superblock.  Obviously, when it did that, it wouldn't change
> sb->s_uuid "dynamically", although the next time the file system was
> mounted, sb->s_uuid would get the new UUID.

Yes, which means for userspace and most of the kernel it's no
different to "unmount, change UUID, mount". It's effectively an
offline change, even if the on-disk superblock is changed while the
filesystem is mounted.

> If overlayfs and IMA are
> expecting that a file system's UUID would stay consant and persistent
> --- well, that's not true, and it has always been that way, since
> there are tools that make it trivially easy for a system administrator
> to adjust the UUID.

Yes, but that's not the point I've been making. My point is that the
*online change of sb->s_uuid* that was being proposed for the
XFS/generic variant of the ext4 online UUID change ioctl is
completely new, and that's where all the problems start....

> In addition to the LISA context, this feature is also commonly used in
> various cloud deployments, since when you create a new VM, it
> typically gets a new root file system, which is copied from a fixed,
> read-only image.  So on a particular hyperscale cloud system, if we
> didn't do anything special, there could be hundreds of thousands VM's
> whose root file system would all have the same UUID, which would mean
> that the UUID... isn't terribly unique.

Again, nothing new here - we've been using snapshots/clones/reflinks
for efficient VM storage provisioning for well over 15 years now.

.....

> This is the reason why we added the ext4 ioctl; it was intended for
> the express use of "tune2fs -U", and like tune2fs -U, it doesn't
> actually change sb->s_uuid; it only changes the on-disk superblock's
> UUID.  This was mostly because we forgot about sb->s_uuid, to be
> honest, but it means that regardless of whether "tune2fs -U" directly
> modifies the block device, or uses the ext4 ioctl, the behaviour with
> respect to sb->s_uuid is the same; it's not modified when the on-disk
> uuid is changed.

IOWs, not only was the ext4 functionality poorly thought out, it
was *poorly implemented*.

So, let's take a step back here - we've done the use case thing to
death now - and consider what is it we actually need here?

All we need for the hyperscale/VM provisioning use case is for the
UUID to be changed at first boot/mount time before anything else
happens.

So why do we need userspace to be involved in that? Indeed,
all the problems stem from needing to have userspace change the
UUID.

There's an obvious solution: a newly provisioned filesystem needs to
change the uuid at first mount. The only issue is the
kernel/filesystem doesn't know when the first mount is.

Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
requires changing userspace init stuff and, well, I hate single use
case mount options like this.

However, we have a golden image that every client image is cloned
from. Say we set a special feature bit in that golden image that
means "need UUID regeneration". Then on the first mount of the
cloned image after provisioning, the filesystem sees the bit and
automatically regenerates the UUID without needing any help from
userspace at all.

Problem solved, yes? We don't need userspace to change the uuid on
first boot of the newly provisioned VM - the filesystem just makes
it happen.

If the "first run" init scripts are set up to run blkid to grab the
new uuid after mount and update whatever needs to be updated with
the new root filesystem UUID, then we've moved the entire problem
out of the VM boot path and back into the provisioning system where
it should be.

And then we don't need an ioctl to change UUIDs online, nor do we
require the VFS, kernel subsystems, userspace infrastructure and
applications to be capable of handling the UUID of a mounted
filesystem changing without warning....

> > > > The VFS does not guarantee in any way that it is safe to change the
> > > > sb->s_uuid (i.e. no locking, no change notifications, no udev
> > > > events, etc). Various subsystems - both in the kernel and in
> > > > userspace - use the sb->s_uuid as a canonical and/or persistent
> > > > filesystem/device identifier and are unprepared to have it change
> > > > while the filesystem is mounted and active.
> 
> Note that the last sentence is a bit ambiguous.

Well, yes, because while the UUID is normally persistent, if the
administrator chooses to modify the UUID while the filesystem is
unmounted, it will change between mounts.  In that case.....

> There is the question
> of whether sb->s_uuid won't change while the file system is mounted,
> and then there is the question of whether s_uuid is **persistent**
> ---- which is to say, that it won't change across mounts or reboots.
>
> If there are subsystems like IMA, overlayfs, pnfs, et.al, which expect
> that, I'm sorry, but sysadmin tools to make it trivially easy to
> change the file system UUID long-predate these other subsystems, and
> there *are* system adminsitrators --- particularly in the LISA or
> Cloud context --- which have used "tune2fs -U" for good and proper
> reasons.

.... it's on the sysadmins to understand they need to regenerate
anything that is reliant on the old filesystem UUIDs before mounting
the filesystem again to avoid these issues...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02  6:34                                     ` Dave Chinner
@ 2023-06-02 10:53                                       ` Amir Goldstein
  2023-06-02 13:52                                       ` Christian Brauner
  2023-06-02 14:58                                       ` Theodore Ts'o
  2 siblings, 0 replies; 61+ messages in thread
From: Amir Goldstein @ 2023-06-02 10:53 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Theodore Ts'o, Darrick J. Wong, Christian Brauner,
	Jeff Layton, miklos, linux-fsdevel, linux-xfs

On Fri, Jun 2, 2023 at 9:35 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> > On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > > Someone ought to cc Ted since I asked him about this topic this morning
> > > and he said he hadn't noticed it going by...
> > >
> > > > > > In addition the uuid should be set when the filesystem is mounted.
> > > > > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > > > > change the uuid.
> > > > >
> > > > > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > > > >
> > > > > We should not be changing the sb->s_uuid of filesysetms dynamically.
> > > >
> > > > Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> > > > ioctl then this is fine though.
> > >
> > > Now that Dave's brought up all kinds of questions about other parts of
> > > the kernel using s_uuid for things, I'm starting to think that even ext4
> > > shouldn't be changing its own uuid on the fly.
> >
> > So let's set some context here.  The tune2fs program in e2fsprogs has
> > supported changing the UUID for a *very* long time.  Specifically,
> > since September 7, 1996 (e2fsprogs version 1.05, when we first added
> > the UUID field in the ext2 superblock).
>
> Yup, and XFS has supported offline changing of the UUID a couple of
> years before that.
>
> > This feature was added from
> > the very beginning since in Large Installation System Administration
> > (LISA) systems, a very common thing to do is to image boot disks from
> > a "golden master", and then afterwards, you want to make sure the file
> > systems on each boot disk have a unique UUID; and this is done via
> > "tune2fs -U random /dev/sdXX".  Since I was working at MIT Project
> > Athena at the time, we regularly did this when installing Athena
> > client workstations, and when I added UUID support to ext2, I made
> > sure this feature was well-supported.
>
> See xfs_copy(8). This was a tool originally written, IIRC, in early
> 1995 for physically cloning sparse golden images in the SGI factory
> production line. It was multi-threaded and could write up to 16 scsi
> disks at once with a single ascending LBA order pass. The last thing
> it does is change the UUID of each clone to make them unique.
>
> There's nothing new here - this is all 30 years ago, and we've had
> tools changing filesystems UUIDs for all this time.
>
> > The tune2fs program allows the UUID to be changed via the file system
> > is mounted (with some caveats), which it did by directly modifying the
> > on-disk superblock.  Obviously, when it did that, it wouldn't change
> > sb->s_uuid "dynamically", although the next time the file system was
> > mounted, sb->s_uuid would get the new UUID.
>
> Yes, which means for userspace and most of the kernel it's no
> different to "unmount, change UUID, mount". It's effectively an
> offline change, even if the on-disk superblock is changed while the
> filesystem is mounted.
>
> > If overlayfs and IMA are
> > expecting that a file system's UUID would stay consant and persistent
> > --- well, that's not true, and it has always been that way, since
> > there are tools that make it trivially easy for a system administrator
> > to adjust the UUID.
>
> Yes, but that's not the point I've been making. My point is that the
> *online change of sb->s_uuid* that was being proposed for the
> XFS/generic variant of the ext4 online UUID change ioctl is
> completely new, and that's where all the problems start....
>
> > In addition to the LISA context, this feature is also commonly used in
> > various cloud deployments, since when you create a new VM, it
> > typically gets a new root file system, which is copied from a fixed,
> > read-only image.  So on a particular hyperscale cloud system, if we
> > didn't do anything special, there could be hundreds of thousands VM's
> > whose root file system would all have the same UUID, which would mean
> > that the UUID... isn't terribly unique.
>
> Again, nothing new here - we've been using snapshots/clones/reflinks
> for efficient VM storage provisioning for well over 15 years now.
>
> .....
>
> > This is the reason why we added the ext4 ioctl; it was intended for
> > the express use of "tune2fs -U", and like tune2fs -U, it doesn't
> > actually change sb->s_uuid; it only changes the on-disk superblock's
> > UUID.  This was mostly because we forgot about sb->s_uuid, to be
> > honest, but it means that regardless of whether "tune2fs -U" directly
> > modifies the block device, or uses the ext4 ioctl, the behaviour with
> > respect to sb->s_uuid is the same; it's not modified when the on-disk
> > uuid is changed.
>
> IOWs, not only was the ext4 functionality was poorly thought out, it
> was *poorly implemented*.
>
> So, let's take a step back here - we've done the use case thing to
> death now - and consider what is it we actually need here?
>
> All we need for the hyperscale/VM provisioning use case is for the
> the UUID to be changed at first boot/mount time before anything else
> happens.
>
> So why do we need userspace to be involved in that? Indeed,
> all the problems stem from needing to have userspace change the
> UUID.
>
> There's an obvious solution: a newly provisioned filesystem needs to
> change the uuid at first mount. The only issue is the
> kernel/filesystem doesn't know when the first mount is.
>
> Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
> requires changing userspace init stuff and, well, I hate single use
> case mount options like this.
>
> However, we have a golden image that every client image is cloned
> from. Say we set a special feature bit in that golden image that
> means "need UUID regeneration". Then on the first mount of the
> cloned image after provisioning, the filesystem sees the bit and
> automatically regenerates the UUID with needing any help from
> userspace at all.
>
> Problem solved, yes? We don't need userspace to change the uuid on
> first boot of the newly provisioned VM - the filesystem just makes
> it happen.
>

I like this idea.

> If the "first run" init scripts are set up to run blkid to grab the
> new uuid after mount and update whatever needs to be updated with
> the new root filesystem UUID, then we've moved the entire problem
> out of the VM boot path and back into the provisioning system where
> it should be.
>

Seems to me like libblkid does not check for unknown feature bits:
https://github.com/util-linux/util-linux/blob/01a0a556018694bfaf6b01a5a40f8d0d10641a1f/libblkid/src/superblocks/xfs.c#L173
I wonder how systems will behave when libblkid examines this image
and finds a null UUID, without regarding the feature flag.
This is something that can be fixed in userspace, but may cause complications.

> And then we don't need an ioctl to change UUIDs online, nor do we
> require the VFS, kernel subsystems, userspace infrastructure and
> applications to be capable of handling the UUID of a mounted
> filesystem changing without warning....
>
> > > > > The VFS does not guarantee in any way that it is safe to change the
> > > > > sb->s_uuid (i.e. no locking, no change notifications, no udev
> > > > > events, etc). Various subsystems - both in the kernel and in
> > > > > userspace - use the sb->s_uuid as a canonical and/or persistent
> > > > > filesystem/device identifier and are unprepared to have it change
> > > > > while the filesystem is mounted and active.
> >
> > Note that the last sentence is a bit ambiguous.
>
> Well, yes, because while the UUID is normally persistent, if the
> administrator chooses to modify the UUID while the filesystem is
> unmounted, it will change between mounts.  In that case.....
>
> > There is the question
> > of whether sb->s_uuid won't change while the file system is mounted,
> > and then there is the question of whether s_uuid is **persistent**
> > ---- which is to say, that it won't change across mounts or reboots.
> >
> > If there are subsystems like IMA, overlayfs, pnfs, et.al, which expect
> > that, I'm sorry, but sysadmin tools to make it trivially easy to
> > change the file system UUID long-predate these other subsystems, and
> > there *are* system adminsitrators --- particularly in the LISA or
> > Cloud context --- which have used "tune2fs -U" for good and proper
> > reasons.
>
> .... it's on the sysadmins to understand they need to regenerate
> anything that is reliant on the old filesystem UUIDs before mounting
> the filesystem again to avoid these issues...
>

For the record, overlayfs looks at s_uuid to try and determine if the
underlying fs was swapped underneath it while overlayfs was offline.
It is sometimes allowed to swap the underlying fs, but overlayfs needs
to know about it.
s_uuid is used as part of a "persistent file handle" in a very similar way
that NFS clients use "fsid" for a unique file handle.

For the very basic overlayfs configuration, changing the lower fs uuid
will result in some overlayfs objects changing their inode numbers.
For overlayfs with opt-in index/nfs_export features, after changing the
underlying fs uuid, overlayfs could no longer be mounted with the same
layer configuration and those opt-in features enabled.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02  1:23                                 ` Darrick J. Wong
  2023-06-02  4:27                                   ` Theodore Ts'o
@ 2023-06-02 13:14                                   ` Christian Brauner
  1 sibling, 0 replies; 61+ messages in thread
From: Christian Brauner @ 2023-06-02 13:14 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Theodore Ts'o, Dave Chinner, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> Someone ought to cc Ted since I asked him about this topic this morning
> and he said he hadn't noticed it going by...

Fwiw, it wasn't intentional. I just dumped people from the old thread
and didn't add new ones, iirc.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02  6:34                                     ` Dave Chinner
  2023-06-02 10:53                                       ` Amir Goldstein
@ 2023-06-02 13:52                                       ` Christian Brauner
  2023-06-02 14:23                                         ` Darrick J. Wong
  2023-06-04 22:59                                         ` Dave Chinner
  2023-06-02 14:58                                       ` Theodore Ts'o
  2 siblings, 2 replies; 61+ messages in thread
From: Christian Brauner @ 2023-06-02 13:52 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Theodore Ts'o, Darrick J. Wong, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> > On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > > Someone ought to cc Ted since I asked him about this topic this morning
> > > and he said he hadn't noticed it going by...
> > > 
> > > > > > In addition the uuid should be set when the filesystem is mounted.
> > > > > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > > > > change the uuid.
> > > > > 
> > > > > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > > > > 
> > > > > We should not be changing the sb->s_uuid of filesysetms dynamically.
> > > > 
> > > > Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> > > > ioctl then this is fine though.
> > > 
> > > Now that Dave's brought up all kinds of questions about other parts of
> > > the kernel using s_uuid for things, I'm starting to think that even ext4
> > > shouldn't be changing its own uuid on the fly.
> > 
> > So let's set some context here.  The tune2fs program in e2fsprogs has
> > supported changing the UUID for a *very* long time.  Specifically,
> > since September 7, 1996 (e2fsprogs version 1.05, when we first added
> > the UUID field in the ext2 superblock).
> 
> Yup, and XFS has supported offline changing of the UUID a couple of
> years before that.
> 
> > This feature was added from
> > the very beginning since in Large Installation System Administration
> > (LISA) systems, a very common thing to do is to image boot disks from
> > a "golden master", and then afterwards, you want to make sure the file
> > systems on each boot disk have a unique UUID; and this is done via
> > "tune2fs -U random /dev/sdXX".  Since I was working at MIT Project
> > Athena at the time, we regularly did this when installing Athena
> > client workstations, and when I added UUID support to ext2, I made
> > sure this feature was well-supported.
> 
> See xfs_copy(8). This was a tool originally written, IIRC, in early
> 1995 for physically cloning sparse golden images in the SGI factory
> production line. It was multi-threaded and could write up to 16 scsi
> disks at once with a single ascending LBA order pass. The last thing
> it does is change the UUID of each clone to make them unique.
> 
> There's nothing new here - this is all 30 years ago, and we've had
> tools changing filesystems UUIDs for all this time.
> 
> > The tune2fs program allows the UUID to be changed via the file system
> > is mounted (with some caveats), which it did by directly modifying the
> > on-disk superblock.  Obviously, when it did that, it wouldn't change
> > sb->s_uuid "dynamically", although the next time the file system was
> > mounted, sb->s_uuid would get the new UUID.
> 
> Yes, which means for userspace and most of the kernel it's no
> different to "unmount, change UUID, mount". It's effectively an
> offline change, even if the on-disk superblock is changed while the
> filesystem is mounted.
> 
> > If overlayfs and IMA are
> > expecting that a file system's UUID would stay consant and persistent
> > --- well, that's not true, and it has always been that way, since
> > there are tools that make it trivially easy for a system administrator
> > to adjust the UUID.
> 
> Yes, but that's not the point I've been making. My point is that the
> *online change of sb->s_uuid* that was being proposed for the
> XFS/generic variant of the ext4 online UUID change ioctl is
> completely new, and that's where all the problems start....
> 
> > In addition to the LISA context, this feature is also commonly used in
> > various cloud deployments, since when you create a new VM, it
> > typically gets a new root file system, which is copied from a fixed,
> > read-only image.  So on a particular hyperscale cloud system, if we
> > didn't do anything special, there could be hundreds of thousands VM's
> > whose root file system would all have the same UUID, which would mean
> > that the UUID... isn't terribly unique.
> 
> Again, nothing new here - we've been using snapshots/clones/reflinks
> for efficient VM storage provisioning for well over 15 years now.
> 
> .....
> 
> > This is the reason why we added the ext4 ioctl; it was intended for
> > the express use of "tune2fs -U", and like tune2fs -U, it doesn't
> > actually change sb->s_uuid; it only changes the on-disk superblock's
> > UUID.  This was mostly because we forgot about sb->s_uuid, to be
> > honest, but it means that regardless of whether "tune2fs -U" directly
> > modifies the block device, or uses the ext4 ioctl, the behaviour with
> > respect to sb->s_uuid is the same; it's not modified when the on-disk
> > uuid is changed.
> 
> IOWs, not only was the ext4 functionality was poorly thought out, it
> was *poorly implemented*.
> 
> So, let's take a step back here - we've done the use case thing to
> death now - and consider what is it we actually need here?
> 
> All we need for the hyperscale/VM provisioning use case is for the
> the UUID to be changed at first boot/mount time before anything else
> happens.
> 
> So why do we need userspace to be involved in that? Indeed,
> all the problems stem from needing to have userspace change the
> UUID.
> 
> There's an obvious solution: a newly provisioned filesystem needs to
> change the uuid at first mount. The only issue is the
> kernel/filesystem doesn't know when the first mount is.
> 
> Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
> requires changing userspace init stuff and, well, I hate single use
> case mount options like this.
> 
> However, we have a golden image that every client image is cloned
> from. Say we set a special feature bit in that golden image that
> means "need UUID regeneration". Then on the first mount of the
> cloned image after provisioning, the filesystem sees the bit and
> automatically regenerates the UUID with needing any help from
> userspace at all.
> 
> Problem solved, yes? We don't need userspace to change the uuid on
> first boot of the newly provisioned VM - the filesystem just makes
> it happen.

systemd-repart implements the following logic currently: If the GPT
*partition* and *disk* UUIDs are 0 then it will generate new UUIDs
before the first mount.

So for the *filesystem* UUID I think the golden image should either have
the UUID set to zero as well or to a special UUID. Either way, it would
mean the filesystem needs to generate a new UUID when it is mounted the
first time.

If we do this then all filesystems that support this should use the same
value to indicate "generate new UUID".

> 
> If the "first run" init scripts are set up to run blkid to grab the
> new uuid after mount and update whatever needs to be updated with
> the new root filesystem UUID, then we've moved the entire problem
> out of the VM boot path and back into the provisioning system where
> it should be.
> 
> And then we don't need an ioctl to change UUIDs online, nor do we

It also doesn't really help that much. What userspace would need is a
way to regenerate the filesystem UUID before the filesystem is mounted.
It doesn't help that much if you have to mount it first to change it...

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02 13:52                                       ` Christian Brauner
@ 2023-06-02 14:23                                         ` Darrick J. Wong
  2023-06-02 15:34                                           ` Christian Brauner
  2023-06-04 22:59                                         ` Dave Chinner
  1 sibling, 1 reply; 61+ messages in thread
From: Darrick J. Wong @ 2023-06-02 14:23 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Dave Chinner, Theodore Ts'o, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 03:52:16PM +0200, Christian Brauner wrote:
> On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> > On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> > > On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > > > Someone ought to cc Ted since I asked him about this topic this morning
> > > > and he said he hadn't noticed it going by...
> > > > 
> > > > > > > In addition the uuid should be set when the filesystem is mounted.
> > > > > > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > > > > > change the uuid.
> > > > > > 
> > > > > > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > > > > > 
> > > > > > We should not be changing the sb->s_uuid of filesysetms dynamically.
> > > > > 
> > > > > Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> > > > > ioctl then this is fine though.
> > > > 
> > > > Now that Dave's brought up all kinds of questions about other parts of
> > > > the kernel using s_uuid for things, I'm starting to think that even ext4
> > > > shouldn't be changing its own uuid on the fly.
> > > 
> > > So let's set some context here.  The tune2fs program in e2fsprogs has
> > > supported changing the UUID for a *very* long time.  Specifically,
> > > since September 7, 1996 (e2fsprogs version 1.05, when we first added
> > > the UUID field in the ext2 superblock).
> > 
> > Yup, and XFS has supported offline changing of the UUID a couple of
> > years before that.
> > 
> > > This feature was added from
> > > the very beginning since in Large Installation System Administration
> > > (LISA) systems, a very common thing to do is to image boot disks from
> > > a "golden master", and then afterwards, you want to make sure the file
> > > systems on each boot disk have a unique UUID; and this is done via
> > > "tune2fs -U random /dev/sdXX".  Since I was working at MIT Project
> > > Athena at the time, we regularly did this when installing Athena
> > > client workstations, and when I added UUID support to ext2, I made
> > > sure this feature was well-supported.
> > 
> > See xfs_copy(8). This was a tool originally written, IIRC, in early
> > 1995 for physically cloning sparse golden images in the SGI factory
> > production line. It was multi-threaded and could write up to 16 scsi
> > disks at once with a single ascending LBA order pass. The last thing
> > it does is change the UUID of each clone to make them unique.
> > 
> > There's nothing new here - this is all 30 years ago, and we've had
> > tools changing filesystems UUIDs for all this time.
> > 
> > > The tune2fs program allows the UUID to be changed via the file system
> > > is mounted (with some caveats), which it did by directly modifying the
> > > on-disk superblock.  Obviously, when it did that, it wouldn't change
> > > sb->s_uuid "dynamically", although the next time the file system was
> > > mounted, sb->s_uuid would get the new UUID.
> > 
> > Yes, which means for userspace and most of the kernel it's no
> > different to "unmount, change UUID, mount". It's effectively an
> > offline change, even if the on-disk superblock is changed while the
> > filesystem is mounted.
> > 
> > > If overlayfs and IMA are
> > > expecting that a file system's UUID would stay consant and persistent
> > > --- well, that's not true, and it has always been that way, since
> > > there are tools that make it trivially easy for a system administrator
> > > to adjust the UUID.
> > 
> > Yes, but that's not the point I've been making. My point is that the
> > *online change of sb->s_uuid* that was being proposed for the
> > XFS/generic variant of the ext4 online UUID change ioctl is
> > completely new, and that's where all the problems start....
> > 
> > > In addition to the LISA context, this feature is also commonly used in
> > > various cloud deployments, since when you create a new VM, it
> > > typically gets a new root file system, which is copied from a fixed,
> > > read-only image.  So on a particular hyperscale cloud system, if we
> > > didn't do anything special, there could be hundreds of thousands VM's
> > > whose root file system would all have the same UUID, which would mean
> > > that the UUID... isn't terribly unique.
> > 
> > Again, nothing new here - we've been using snapshots/clones/reflinks
> > for efficient VM storage provisioning for well over 15 years now.
> > 
> > .....
> > 
> > > This is the reason why we added the ext4 ioctl; it was intended for
> > > the express use of "tune2fs -U", and like tune2fs -U, it doesn't
> > > actually change sb->s_uuid; it only changes the on-disk superblock's
> > > UUID.  This was mostly because we forgot about sb->s_uuid, to be
> > > honest, but it means that regardless of whether "tune2fs -U" directly
> > > modifies the block device, or uses the ext4 ioctl, the behaviour with
> > > respect to sb->s_uuid is the same; it's not modified when the on-disk
> > > uuid is changed.

...which means that anyone writing out non-ext4 ondisk metadata will now
be doing it with a stale fsuuid.  Er... that might just be an ext*
quirk that everyone will have to live with.

> > IOWs, not only was the ext4 functionality was poorly thought out, it
> > was *poorly implemented*.
> > 
> > So, let's take a step back here - we've done the use case thing to
> > death now - and consider what is it we actually need here?
> > 
> > All we need for the hyperscale/VM provisioning use case is for the
> > the UUID to be changed at first boot/mount time before anything else
> > happens.
> > 
> > So why do we need userspace to be involved in that? Indeed,
> > all the problems stem from needing to have userspace change the
> > UUID.
> > 
> > There's an obvious solution: a newly provisioned filesystem needs to
> > change the uuid at first mount. The only issue is the
> > kernel/filesystem doesn't know when the first mount is.
> > 
> > Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
> > requires changing userspace init stuff and, well, I hate single use
> > case mount options like this.
> > 
> > However, we have a golden image that every client image is cloned
> > from. Say we set a special feature bit in that golden image that
> > means "need UUID regeneration". Then on the first mount of the
> > cloned image after provisioning, the filesystem sees the bit and
> > automatically regenerates the UUID with needing any help from
> > userspace at all.
> > 
> > Problem solved, yes? We don't need userspace to change the uuid on
> > first boot of the newly provisioned VM - the filesystem just makes
> > it happen.
> 
> systemd-repart implements the following logic currently: If the GPT
> *partition* and *disk* UUIDs are 0 then it will generate new UUIDs
> before the first mount.
> 
> So for the *filesystem* UUID I think the golden image should either have
> the UUID set to zero as well or to a special UUID. Either way, it would
> mean the filesystem needs to generate a new UUID when it is mounted the
> first time.
> 
> If we do this then all filesystems that support this should use the same
> value to indicate "generate new UUID".

Curiously, I noticed that blkid doesn't report the xfs uuid if it's all
zeroes:

# mkfs.xfs -f /dev/loop0 -m uuid=00000000-0000-0000-0000-000000000000

# blkid /dev/loop0
/dev/loop0: BLOCK_SIZE="512" TYPE="xfs"

Nor does udev create symlinks:

# ls /dev/disk/by-uuid/0*
ls: cannot access '/dev/disk/by-uuid/0*': No such file or directory

Nor does mounting by uuid work:

# mount UUID=00000000-0000-0000-0000-000000000000 /tmp/x
mount: /tmp/x: can't find UUID=00000000-0000-0000-0000-000000000000.

So I wonder if xfs even really needs a new superblock bit at all --
mounting via uuid doesn't work in the zeroed-uuid case, and the kernel
could indeed generate a new one at mount time before it populates
s_uuid, etc.  Then the initscripts can re-run blkid (or xfs_info) to
extract the new uuid and update config files as needed.

Though, the first-mount uuid would still break anything recorded in the
non-xfs metadata by the image creating system (such as evm attributes).
But at least that's on the image creator people to know that.

> > 
> > If the "first run" init scripts are set up to run blkid to grab the
> > new uuid after mount and update whatever needs to be updated with
> > the new root filesystem UUID, then we've moved the entire problem
> > out of the VM boot path and back into the provisioning system where
> > it should be.
> > 
> > And then we don't need an ioctl to change UUIDs online, nor do we
> 
> It also doesn't really help that much. What userspace would need is a
> way to regenerate the filesystem UUID before the filesystem is mounted.
> It doesn't help that much if you have to mount it first to change it...

<shrug> Well it's the rootfs where we want to change the uuid at
first-run time, and all the config info that needs updating is inside
the rootfs anyway.  If someone needs mount-by-uuid for the rootfs during
the first run or they require a specific uuid, they can still run
xfs_admin from within the initramfs.

--D

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02  6:34                                     ` Dave Chinner
  2023-06-02 10:53                                       ` Amir Goldstein
  2023-06-02 13:52                                       ` Christian Brauner
@ 2023-06-02 14:58                                       ` Theodore Ts'o
  2023-06-04 22:35                                         ` Dave Chinner
  2 siblings, 1 reply; 61+ messages in thread
From: Theodore Ts'o @ 2023-06-02 14:58 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Darrick J. Wong, Christian Brauner, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> IOWs, not only was the ext4 functionality was poorly thought out, it
> was *poorly implemented*.

Shrug.  It's 100% compatible with "tune2fs -U <uuid>" which existed
prior to sb->s_uuid and /proc/XXX/mountstats, and which has allowed
on-line, mounted changes of the UUID.  So as far as I'm concerned,
it's "working as intended".  It fixed a real bug where racing
resize2fs and tune2fs -U in separate systemd unit files could result
in superblock checksum failures, and it fixed that issue.

It doesn't make any changes to how on-line "tune2fs -U <uuid>"
functioned, because the definition of s_uuid wasn't terribly well
defined (and "tune2fs -U" predates it in any case).  Originally s_uuid
was just to allow /proc/XXX/mountstats to expose the UUID, but at this
point, I don't think anyone has a complete understanding of the
assumptions that overlayfs, IMA, and other userspace utilities make
about how the file system UUID should be used and what it denotes.

> However, we have a golden image that every client image is cloned
> from. Say we set a special feature bit in that golden image that
> means "need UUID regeneration". Then on the first mount of the
> cloned image after provisioning, the filesystem sees the bit and
> automatically regenerates the UUID with needing any help from
> userspace at all.

> Problem solved, yes? We don't need userspace to change the uuid on
> first boot of the newly provisioned VM - the filesystem just makes
> it happen.

I agree that's a good design --- and ten years from now, once all of the
users using old versions of RHEL have finally migrated off to a
version of some enterprise linux that supports this new feature, the
cloud agents which are using "tune2fs -U <uuid>" or "xfs_admin -U
<uuid>" can stop relying on it and switch to this new scheme.

What we could do is to make it easy to determine whether the kernel
supports the "UUID regeneration" feature, and whether the file system
had its UUID regenerated (because some cloud images generated using an
older distro's installer won't request the UUID regeneration), so that
cloud agents (which are typically installed as a daemon that starts
out of an init.d or systemd unit file) will know whether or not they
need to fallback to the userspace UUID regeneration.

For cloud agents which are installed as a one-shot executable run out
of the initramfs, we might be able to change the UUID before the root
file system is mounted.  Of course, there are those userspace setups
where the use of an initramfs is optional or not used at all.

So for the short-term, we're going to be stuck with userspace mediated
UUID changes, and there are going to be userspace or kernel
subsystems that are going to be surprised when the UUID changes out from
under them.  So having some kind of documentation which describes how
various subsystems are using the file system UUID, and whether they
are getting it from sb->s_uuid, /proc/XXX/mountstats, or some other
source, that would probably be useful.  After all, system
administrators' access to "tune2fs -U" and "xfs_admin -U" isn't going
away, and if we're saying "it's up to them to understand the
implications", it's nice if we document the gotchas.   :-)

	       	    	       		    - Ted

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02 14:23                                         ` Darrick J. Wong
@ 2023-06-02 15:34                                           ` Christian Brauner
  0 siblings, 0 replies; 61+ messages in thread
From: Christian Brauner @ 2023-06-02 15:34 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Dave Chinner, Theodore Ts'o, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 07:23:29AM -0700, Darrick J. Wong wrote:
> On Fri, Jun 02, 2023 at 03:52:16PM +0200, Christian Brauner wrote:
> > On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> > > On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> > > > On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > > > > Someone ought to cc Ted since I asked him about this topic this morning
> > > > > and he said he hadn't noticed it going by...
> > > > > 
> > > > > > > > In addition the uuid should be set when the filesystem is mounted.
> > > > > > > > Unless the filesystem implements a dedicated ioctl() - like ext4 - to
> > > > > > > > change the uuid.
> > > > > > > 
> > > > > > > IMO, that ext4 functionality is a landmine waiting to be stepped on.
> > > > > > > 
> > > > > > > We should not be changing the sb->s_uuid of filesysetms dynamically.
> > > > > > 
> > > > > > Yeah, I kinda agree. If it works for ext4 and it's an ext4 specific
> > > > > > ioctl then this is fine though.
> > > > > 
> > > > > Now that Dave's brought up all kinds of questions about other parts of
> > > > > the kernel using s_uuid for things, I'm starting to think that even ext4
> > > > > shouldn't be changing its own uuid on the fly.
> > > > 
> > > > So let's set some context here.  The tune2fs program in e2fsprogs has
> > > > supported changing the UUID for a *very* long time.  Specifically,
> > > > since September 7, 1996 (e2fsprogs version 1.05, when we first added
> > > > the UUID field in the ext2 superblock).
> > > 
> > > Yup, and XFS has supported offline changing of the UUID a couple of
> > > years before that.
> > > 
> > > > This feature was added from
> > > > the very beginning since in Large Installation System Administration
> > > > (LISA) systems, a very common thing to do is to image boot disks from
> > > > a "golden master", and then afterwards, you want to make sure the file
> > > > systems on each boot disk have a unique UUID; and this is done via
> > > > "tune2fs -U random /dev/sdXX".  Since I was working at MIT Project
> > > > Athena at the time, we regularly did this when installing Athena
> > > > client workstations, and when I added UUID support to ext2, I made
> > > > sure this feature was well-supported.
> > > 
> > > See xfs_copy(8). This was a tool originally written, IIRC, in early
> > > 1995 for physically cloning sparse golden images in the SGI factory
> > > production line. It was multi-threaded and could write up to 16 scsi
> > > disks at once with a single ascending LBA order pass. The last thing
> > > it does is change the UUID of each clone to make them unique.
> > > 
> > > There's nothing new here - this is all 30 years ago, and we've had
> > > tools changing filesystems UUIDs for all this time.
> > > 
> > > > The tune2fs program allows the UUID to be changed via the file system
> > > > is mounted (with some caveats), which it did by directly modifying the
> > > > on-disk superblock.  Obviously, when it did that, it wouldn't change
> > > > sb->s_uuid "dynamically", although the next time the file system was
> > > > mounted, sb->s_uuid would get the new UUID.
> > > 
> > > Yes, which means for userspace and most of the kernel it's no
> > > different to "unmount, change UUID, mount". It's effectively an
> > > offline change, even if the on-disk superblock is changed while the
> > > filesystem is mounted.
> > > 
> > > > If overlayfs and IMA are
> > > > expecting that a file system's UUID would stay consant and persistent
> > > > --- well, that's not true, and it has always been that way, since
> > > > there are tools that make it trivially easy for a system administrator
> > > > to adjust the UUID.
> > > 
> > > Yes, but that's not the point I've been making. My point is that the
> > > *online change of sb->s_uuid* that was being proposed for the
> > > XFS/generic variant of the ext4 online UUID change ioctl is
> > > completely new, and that's where all the problems start....
> > > 
> > > > In addition to the LISA context, this feature is also commonly used in
> > > > various cloud deployments, since when you create a new VM, it
> > > > typically gets a new root file system, which is copied from a fixed,
> > > > read-only image.  So on a particular hyperscale cloud system, if we
> > > > didn't do anything special, there could be hundreds of thousands VM's
> > > > whose root file system would all have the same UUID, which would mean
> > > > that the UUID... isn't terribly unique.
> > > 
> > > Again, nothing new here - we've been using snapshots/clones/reflinks
> > > for efficient VM storage provisioning for well over 15 years now.
> > > 
> > > .....
> > > 
> > > > This is the reason why we added the ext4 ioctl; it was intended for
> > > > the express use of "tune2fs -U", and like tune2fs -U, it doesn't
> > > > actually change sb->s_uuid; it only changes the on-disk superblock's
> > > > UUID.  This was mostly because we forgot about sb->s_uuid, to be
> > > > honest, but it means that regardless of whether "tune2fs -U" directly
> > > > modifies the block device, or uses the ext4 ioctl, the behaviour with
> > > > respect to sb->s_uuid is the same; it's not modified when the on-disk
> > > > uuid is changed.
> 
> ...which means that anyone writing out non-ext4 ondisk metadata will now
> be doing it with a stale fsuuid.  Er... that might just be an ext*
> quirk that everyone will have to live with.
> 
> > > IOWs, not only was the ext4 functionality was poorly thought out, it
> > > was *poorly implemented*.
> > > 
> > > So, let's take a step back here - we've done the use case thing to
> > > death now - and consider what is it we actually need here?
> > > 
> > > All we need for the hyperscale/VM provisioning use case is for the
> > > the UUID to be changed at first boot/mount time before anything else
> > > happens.
> > > 
> > > So why do we need userspace to be involved in that? Indeed,
> > > all the problems stem from needing to have userspace change the
> > > UUID.
> > > 
> > > There's an obvious solution: a newly provisioned filesystem needs to
> > > change the uuid at first mount. The only issue is the
> > > kernel/filesystem doesn't know when the first mount is.
> > > 
> > > Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
> > > requires changing userspace init stuff and, well, I hate single use
> > > case mount options like this.
> > > 
> > > However, we have a golden image that every client image is cloned
> > > from. Say we set a special feature bit in that golden image that
> > > means "need UUID regeneration". Then on the first mount of the
> > > cloned image after provisioning, the filesystem sees the bit and
> > > automatically regenerates the UUID with needing any help from
> > > userspace at all.
> > > 
> > > Problem solved, yes? We don't need userspace to change the uuid on
> > > first boot of the newly provisioned VM - the filesystem just makes
> > > it happen.
> > 
> > systemd-repart implements the following logic currently: If the GPT
> > *partition* and *disk* UUIDs are 0 then it will generate new UUIDs
> > before the first mount.
> > 
> > So for the *filesystem* UUID I think the golden image should either have
> > the UUID set to zero as well or to a special UUID. Either way, it would
> > mean the filesystem needs to generate a new UUID when it is mounted the
> > first time.
> > 
> > If we do this then all filesystems that support this should use the same
> > value to indicate "generate new UUID".
> 
> Curiously, I noticed that blkid doesn't report the xfs uuid if it's all
> zeroes:
> 
> # mkfs.xfs -f /dev/loop0 -m uuid=00000000-0000-0000-0000-000000000000
> 
> # blkid /dev/loop0
> /dev/loop0: BLOCK_SIZE="512" TYPE="xfs"

You should use blkid -p btw because without -p blkid checks a cache
which is problematic.

> 
> Nor does udev create symlinks:
> 
> # ls /dev/disk/by-uuid/0*
> ls: cannot access '/dev/disk/by-uuid/0*': No such file or directory

Yeah, it can't because there's no uuid and zero is treated as "not set".

> 
> Nor does mounting by uuid work:
> 
> # mount UUID=00000000-0000-0000-0000-000000000000 /tmp/x
> mount: /tmp/x: can't find UUID=00000000-0000-0000-0000-000000000000.
> 
> So I wonder if xfs even really needs a new superblock bit at all --
> mounting via uuid doesn't work in the zeroed-uuid case, and the kernel
> could indeed generate a new one at mount time before it populates
> s_uuid, etc.  Then the initscripts can re-run blkid (or xfs_info) to
> extract the new uuid and update config files as needed.

Yeah, that's my proposal and it's closely mirrored on what we did for
systemd-repart:

6. Similarly, all existing partitions for which configuration files
   exist and which currently have an all-zero identifying UUID will be
   assigned a new UUID. This UUID is cryptographically hashed from a
   common seed value together with the partition type UUID (and a
   counter in case multiple partitions of the same type are defined),
   see below. The same is done for all partitions that are created anew.
   These assignments are done in memory only, too, the disk is not
   updated yet.

7. Similarly, if the disk's volume UUID is all zeroes it is also
   initialized, also cryptographically hashed from the same common seed
   value. This is done in memory only too.

[...]

9. The new partition table is finally written to disk. The kernel is
   asked to reread the partition table.

https://www.freedesktop.org/software/systemd/man/systemd-repart.service.html

> 
> Though, the first-mount uuid would still break anything recorded in the
> non-xfs metadata by the image creating system (such as evm attributes).
> But at least that's on the image creator people to know that.

Sure, but that's a generic userspace problem for any identifier relying
on or derived from the filesystem uuid. IOW, that's not really our
concern imho.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02 14:58                                       ` Theodore Ts'o
@ 2023-06-04 22:35                                         ` Dave Chinner
  0 siblings, 0 replies; 61+ messages in thread
From: Dave Chinner @ 2023-06-04 22:35 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Darrick J. Wong, Christian Brauner, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 10:58:16AM -0400, Theodore Ts'o wrote:
> On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> > However, we have a golden image that every client image is cloned
> > from. Say we set a special feature bit in that golden image that
> > means "need UUID regeneration". Then on the first mount of the
> > cloned image after provisioning, the filesystem sees the bit and
> > automatically regenerates the UUID with needing any help from
> > userspace at all.
> 
> > Problem solved, yes? We don't need userspace to change the uuid on
> > first boot of the newly provisioned VM - the filesystem just makes
> > it happen.
> 
> I agree that's a good design --- and ten years now, from all of the
> users using old versions of RHEL have finally migrated off to a
> version of some enterprise linux that supports this new feature, the
> cloud agents which are using "tune2fs -U <uuid>" or "xfs_admin -U
> <uuid>" can stop relying on it and switching to this new scheme.

We're talking about building new infrastructure - regardless
of anything else in this discussion, existing software will always
do what existing software does.

As low level infrastructure designers, we have to think *10 years
ahead* and design for when the feature will be widespread. Designing
infrastructure with "we need a fix right now" in mind almost always
ends with poor results because the focus is "this thing right now"
instead of "how will this work when this gets deployed world-wide by
everyone"....

ext4 developers and the hyperscalers that employ them made a bad
decision due to short-termism. It's only right that the wider
community pushes back against propagating that bad decision into
generic code that everyone will have to live with for the next 20+
years.

We can do better.  We *should* be doing better.

> So for the short-term, we're going to be stuck with userspace mediated
> UUID changes, and if there are going to be userspace or kernel

No, "we" aren't stuck with whacky dynamic runtime ext4 UUID changes.
*ext4 developers* and _hyperscalers that have deployed this on ext4_
are stuck with this awful stuff.

Everyone else gets to learn from the mistakes that have been made,
and "we" will end up with a generic solution that is better and will
work on all filesystems that support UUIDs, including ext4.

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-02 13:52                                       ` Christian Brauner
  2023-06-02 14:23                                         ` Darrick J. Wong
@ 2023-06-04 22:59                                         ` Dave Chinner
  2023-06-05 11:37                                           ` Christian Brauner
  1 sibling, 1 reply; 61+ messages in thread
From: Dave Chinner @ 2023-06-04 22:59 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Theodore Ts'o, Darrick J. Wong, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Fri, Jun 02, 2023 at 03:52:16PM +0200, Christian Brauner wrote:
> On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> > On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> > > On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > There's an obvious solution: a newly provisioned filesystem needs to
> > change the uuid at first mount. The only issue is the
> > kernel/filesystem doesn't know when the first mount is.
> > 
> > Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
> > requires changing userspace init stuff and, well, I hate single use
> > case mount options like this.
> > 
> > However, we have a golden image that every client image is cloned
> > from. Say we set a special feature bit in that golden image that
> > means "need UUID regeneration". Then on the first mount of the
> > cloned image after provisioning, the filesystem sees the bit and
> > automatically regenerates the UUID with needing any help from
> > userspace at all.
> > 
> > Problem solved, yes? We don't need userspace to change the uuid on
> > first boot of the newly provisioned VM - the filesystem just makes
> > it happen.
> 
> systemd-repart implements the following logic currently: If the GPT
> *partition* and *disk* UUIDs are 0 then it will generate new UUIDs
> before the first mount.
> 
> So for the *filesystem* UUID I think the golden image should either have
> the UUID set to zero as well or to a special UUID. Either way, it would
> mean the filesystem needs to generate a new UUID when it is mounted the
> first time.
> 
> If we do this then all filesystems that support this should use the same
> value to indicate "generate new UUID".

Ok, the main problem here is that all existing filesystem
implementations don't consider a zero UUID special. If you do this
on an existing kernel, it won't do anything and will not throw any
errors. Now we have the problem that userspace infrastructure can't
rely on the kernel telling it that it doesn't support the
functionality it is relying on. i.e. we have a mounted filesystem
and now userspace has to detect and handle the fact it still needs
to change the filesystem UUID.

Further, if this is not handled properly, every root filesystem
having a zero or duplicate "special" UUID is a landmine for OS
kernel upgrades to trip over. i.e. upgrade from old, unsupported to
new supported kernel and the next boot regens the UUID unexpectedly
and breaks anything relying on the old UUID.

Hence the point of using a feature bit is that the kernel will
refuse to mount the filesystem if it does not understand the feature
bit. This way we have a hard image deployment testing failure that people
building and deploying images will notice. Hence they can configure
the build scripts to use the correct "change uuid" mechanism
with older OS releases and can take appropriate action when building
"legacy OS" images.

Yes, distros and vendors can backport the feature bit support if
they want, and then deployment of up-to-date older OS releases will
work with this new infrastructure correctly. But that is not
guaranteed to happen, so we really need a hard failure for
unsupported kernels.

So, yeah, I really do think this needs to be driven by a filesystem
feature bit, not retrospectively defining a special UUID value to
trigger this upgrade behaviour...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-04 22:59                                         ` Dave Chinner
@ 2023-06-05 11:37                                           ` Christian Brauner
  2023-06-05 14:36                                             ` Theodore Ts'o
  0 siblings, 1 reply; 61+ messages in thread
From: Christian Brauner @ 2023-06-05 11:37 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Theodore Ts'o, Darrick J. Wong, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Mon, Jun 05, 2023 at 08:59:33AM +1000, Dave Chinner wrote:
> On Fri, Jun 02, 2023 at 03:52:16PM +0200, Christian Brauner wrote:
> > On Fri, Jun 02, 2023 at 04:34:58PM +1000, Dave Chinner wrote:
> > > On Fri, Jun 02, 2023 at 12:27:14AM -0400, Theodore Ts'o wrote:
> > > > On Thu, Jun 01, 2023 at 06:23:35PM -0700, Darrick J. Wong wrote:
> > > There's an obvious solution: a newly provisioned filesystem needs to
> > > change the uuid at first mount. The only issue is the
> > > kernel/filesystem doesn't know when the first mount is.
> > > 
> > > Darrick suggested "mount -o setuuid=xxxx" on #xfs earlier, but that
> > > requires changing userspace init stuff and, well, I hate single use
> > > case mount options like this.
> > > 
> > > However, we have a golden image that every client image is cloned
> > > from. Say we set a special feature bit in that golden image that
> > > means "need UUID regeneration". Then on the first mount of the
> > > cloned image after provisioning, the filesystem sees the bit and
> > > automatically regenerates the UUID with needing any help from
> > > userspace at all.
> > > 
> > > Problem solved, yes? We don't need userspace to change the uuid on
> > > first boot of the newly provisioned VM - the filesystem just makes
> > > it happen.
> > 
> > systemd-repart implements the following logic currently: If the GPT
> > *partition* and *disk* UUIDs are 0 then it will generate new UUIDs
> > before the first mount.
> > 
> > So for the *filesystem* UUID I think the golden image should either have
> > the UUID set to zero as well or to a special UUID. Either way, it would
> > mean the filesystem needs to generate a new UUID when it is mounted the
> > first time.
> > 
> > If we do this then all filesystems that support this should use the same
> > value to indicate "generate new UUID".
> 
> Ok, the main problem here is that all existing filesystem
> implementations don't consider a zero UUID special. If you do this
> on an existing kernel, it won't do anything and will not throw any
> errors. Now we have the problem that userspace infrastructure can't
> rely on the kernel telling it that it doesn't support the
> functionality it is relying on. i.e. we have a mounted filesystems
> and now userspace has to detect and handle the fact it still needs
> to change the filesystem UUID.
> 
> Further, if this is not handled properly, every root filesystem
> having a zero or duplicate "special" UUID is a landmine for OS
> kernel upgrades to trip over. i.e. upgrade from old, unsupported to
> new supported kernel and the next boot regens the UUID unexpectedly
> and breaks anything relying on the old UUID.
> 
> Hence the point of using a feature bit is that the kernel will
> refuse to mount the filesysetm if it does not understand the feature
> bit. This way we have a hard image deployment testing failure that people
> building and deploying images will notice. Hence they can configure
> the build scripts to use the correct "change uuid" mechanism
> with older OS releases and can take appropriate action when building
> "legacy OS" images.
> 
> Yes, distros and vendors can backport the feature bit support if
> they want, and then deployment of up-to-date older OS releases will
> work with this new infrastructure correctly. But that is not
> guaranteed to happen, so we really need a hard failure for
> unsupported kernels.
> 
> So, yeah, I really do think this needs to be driven by a filesystem
> feature bit, not retrospectively defining a special UUID value to
> trigger this upgrade behaviour...

Using a zero/special UUID would have made this usable for most
filesystems which allows userspace to more easily detect this. Using a
filesystem feature bit makes this a lot more fragmented between
filesystems.

But allowing to refuse being mounted on older kernels when the feature
bit is set and unknown can be quite useful. So this is also fine by me.

So, the protocol should be to create a filesystem with a zero UUID and
the new feature bit set. At the first mount the UUID will be generated.

Only thing I would really love to see is a short blurb about this in
Documentation/filesystems/uuid.rst so we have a reference point for how
we expect this to work and how a filesystem should implement this.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-05 11:37                                           ` Christian Brauner
@ 2023-06-05 14:36                                             ` Theodore Ts'o
  2023-06-06  0:54                                               ` Dave Chinner
  0 siblings, 1 reply; 61+ messages in thread
From: Theodore Ts'o @ 2023-06-05 14:36 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Dave Chinner, Darrick J. Wong, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Mon, Jun 05, 2023 at 01:37:40PM +0200, Christian Brauner wrote:
> Using a zero/special UUID would have made this usable for most
> filesystems which allows userspace to more easily detect this. Using a
> filesystem feature bit makes this a lot more fragmented between
> filesystems.

Not all file systems have feature bits.  So I'd suggest that how this
should be a file system specific implementation detail.  If with a
newer kernel, a file systems sets the UUID to a random value if it is
all zeros when it is mounted should be relatively simple.

However, there are some questions this brings up.  What should the
semantics be if a file system creates a file system-level snapshot ---
should the UUID be refreshed?  What if it is a block-level file system
snapshot using LVM --- should the UUID be refreshed in that case?

As I've been trying to point out, exactly what the semantics of a file
system level UUID are has never been well defined, and it's not clear what
various subsystems are trying to *do* with the UUID.  And given that
what can happen with mount name spaces, bind mounts, etc., we should
ask whether the assumptions they are making with respect to UUID is in
fact something we should be encouraging.

> But allowing to refuse being mounted on older kernels when the feature
> bit is set and unknown can be quite useful. So this is also fine by me.

This pretty much guarantees people won't use the feature for a while.
People complain when a file system can't be mounted.  Using a feature
bit is also very likely to mean that you won't be able to run an older
fsck on that file system --- for what users would complain would be no
good reason.  And arguably, they would be right to complain.

						- Ted

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: uuid ioctl - was: Re: [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes
  2023-06-05 14:36                                             ` Theodore Ts'o
@ 2023-06-06  0:54                                               ` Dave Chinner
  0 siblings, 0 replies; 61+ messages in thread
From: Dave Chinner @ 2023-06-06  0:54 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Christian Brauner, Darrick J. Wong, Amir Goldstein, Jeff Layton,
	miklos, linux-fsdevel, linux-xfs

On Mon, Jun 05, 2023 at 10:36:38AM -0400, Theodore Ts'o wrote:
> On Mon, Jun 05, 2023 at 01:37:40PM +0200, Christian Brauner wrote:
> > Using a zero/special UUID would have made this usable for most
> > filesystems which allows userspace to more easily detect this. Using a
> > filesystem feature bit makes this a lot more fragmented between
> > filesystems.
> 
> Not all file systems have feature bits.  So I'd suggest that how this
> should be a file system specific implementation detail.  If with a
> newer kernel, a file systems sets the UUID to a random value if it is
> all zeros when it is mounted should be relatively simple.

Sure, but this is a *fs implementation detail*, not a user API
requirement.

If the filesystem has feature bits, then it should use them, not
rely on zero UUID values because existing filesystems and/or images
could have zero values in them and the user may not want them to be
regenerated on mount.  That's a retrospective change of on-disk
format behaviour, and hence requires feature bits to manage....

> However, there are some questions this brings up.  What should the
> semantics be if a file system creates a file system-level snapshot ---
> should the UUID be refreshed?  What if it is a block-level file system
> snapshot using LVM --- should the UUID be refreshed in that case?

Engage your brain, Ted. Existing workflows with snapshots are
completely unchanged by this proposal. If you take a device level
snapshot and then want to mount it, you have to change the UUID
before it gets mounted..

Indeed, XFS will refuse to mount filesystems with duplicate UUIDs;
the admin has been forced to run xfs admin tools to regenerate the
UUID before mounting the snapshot image for the past 20+ years. Or
for pure read-only snapshots, they need to use "-o
ro,norecovery,nouuid" to allow a pure read-only mount with a
duplicate UUID. The "nouuid" mount option has been around for almost
22 years:

commit 813e9410043e88b474b8b2b43c8d8e52ea90f155
Author: Steve Lord <lord@sgi.com>
Date:   Fri Jun 29 22:29:47 2001 +0000

    Add nouuid mount option

Either way, the admin has to manage UUIDs for device level
snapshots, and there is no change in that at all.

IOWs, there is no change to existing workflows because they already
require UUIDs to be directly manipulated by the user before or at
mount time for correct behaviour.

> As I've been trying to point out, exactly what the semantics of a file
> system level UUID has never been well defined, and it's not clear what
> various subsystems are trying to *do* with the UUID.  And given that
> what can happen with mount name spaces, bind mounts, etc., we should
> ask whether the assumptions they are making with respect to UUID is in
> fact something we should be encouraging.

We can't put that genie back in the bottle.

But it does raise a further interesting question about sb->s_uuid:
is one uuid sufficient for a superblock? We have two specific use
cases here:

1. A uuid that uniquely identifies every filesystem (e.g. blkid,
   pnfs, /dev/disk/by-uuid/, etc)
2. A persistent, unchanging uuid that can be used to key persistent
   objects to the underlying filesystem (overlay, security xattrs,
   etc) regardless of snapshots, cloning, dedupe, etc.

We already have a solution to that problem in XFS, sbp->sb_uuid
is for case #1, sbp->sb_metauuid is for case #2 as every metadata
block in the filesystem is keyed with sbp->sb_metauuid. Both start out
the same at mkfs time, but if we then regenerate the filesystem
uuid, then only sbp->sb_uuid is changed. We do not rewrite metadata
with the new uuid, doing so would break snapshot/clone/dedupe in
shared filesystem images.

This is one of the things that the XFS online UUID change proposal
added - it allowed for userspace to query the sbp->sb_metauuid in
addition to the sbp->sb_uuid so that userspace init script
orchestration can make use of it for persistent userspace filesystem
objects rather than the sbp->s_uuid identifier....

> > But allowing to refuse being mounted on older kernels when the feature
> > bit is set and unknown can be quite useful. So this is also fine by me.
> 
> This pretty much guarantees people won't use the feature for a while.

Perfectly fine by me. Those that need it will backport/upgrade both
userspace and kernels immediately, and they reap the benefits
immediately. Everyone else gets it as distros roll out with the
functionality enabled and fully supported across the toolchain.

This is how all new feature additions work, so I'm not sure why you
think this is a reason not to use a feature bit...

> People complain when a file system cann't be mounted.  Using a feature
> bit is also very likely to mean that you won't be able to run an older
> fsck on that file system --- for what users would complain would be no
> good reason.  And arguably, they would be right to complain.

In general, yes, but this is *not a general case*.

If you have a golden image with the feature bit set, why would you
ever run a fsck that doesn't support the feature bit on it? You have
to have a tool chain that supports the feature bit to set it in the
first place.

And if the feature bit is set, then you must be running client kernels
that support it (and will clear it on first mount), so once the
client system is running, the feature bit will never be set and so
the toolchain in the client OS just doesn't matter at all.

There is literally no other use case for this feature, so arguing
about generalities that simply don't apply to the specific use case
really isn't that helpful.

As a result, I don't see that there are any concerns about using a
feature bit at all, yet I see substantial benefit from not
retrospectively redefining a special on-disk UUID value that
silently drives new kernel behaviour.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 61+ messages in thread

end of thread, other threads:[~2023-06-06  0:55 UTC | newest]

Thread overview: 61+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-05 17:14 [PATCH] overlayfs: Trigger file re-evaluation by IMA / EVM after writes Stefan Berger
2023-04-06 10:26 ` Christian Brauner
2023-04-06 14:05   ` Paul Moore
2023-04-06 14:20     ` Stefan Berger
2023-04-06 14:36       ` Paul Moore
2023-04-06 15:01         ` Christian Brauner
2023-04-06 18:46           ` Jeff Layton
2023-04-06 19:11             ` Stefan Berger
2023-04-06 19:37               ` Jeff Layton
2023-04-06 20:22                 ` Stefan Berger
2023-04-06 21:24                   ` Jeff Layton
2023-04-06 21:58                     ` Stefan Berger
2023-04-06 22:09                       ` Jeff Layton
2023-04-06 22:04                     ` Jeff Layton
2023-04-06 22:27                       ` Stefan Berger
2023-04-07  8:31                       ` Christian Brauner
2023-04-07 13:29                         ` Jeff Layton
2023-04-09 15:22                           ` Christian Brauner
2023-04-09 22:12                             ` Jeff Layton
2023-04-11  8:38                               ` Christian Brauner
2023-04-11  9:32                                 ` Jeff Layton
2023-04-11  9:49                                   ` Christian Brauner
2023-04-11 10:13                                     ` Jeff Layton
2023-04-11 14:08                                       ` Christian Brauner
2023-04-21 14:55                                 ` Mimi Zohar
2023-04-17  1:57                           ` Stefan Berger
2023-04-17  8:11                             ` Christian Brauner
2023-04-17 10:05                             ` Jeff Layton
2023-04-17 12:45                               ` Stefan Berger
2023-04-17 13:18                                 ` Jeff Layton
2023-04-21 14:43                           ` Mimi Zohar
2023-05-18 20:46                             ` Paul Moore
2023-05-18 20:50                               ` Mimi Zohar
2023-05-19 14:58                                 ` Paul Moore
2023-05-25 14:43                                   ` Mimi Zohar
2023-05-19 19:42                         ` Mimi Zohar
2023-05-20  9:15                           ` Amir Goldstein
2023-05-22 12:18                             ` Mimi Zohar
2023-05-22 14:00                               ` Amir Goldstein
2023-05-23 19:38                                 ` Mimi Zohar
2023-05-20  9:17                           ` Christian Brauner
2023-05-21 22:49                             ` Dave Chinner
2023-05-22 10:50                               ` uuid ioctl - was: " Christian Brauner
2023-06-02  1:23                                 ` Darrick J. Wong
2023-06-02  4:27                                   ` Theodore Ts'o
2023-06-02  6:34                                     ` Dave Chinner
2023-06-02 10:53                                       ` Amir Goldstein
2023-06-02 13:52                                       ` Christian Brauner
2023-06-02 14:23                                         ` Darrick J. Wong
2023-06-02 15:34                                           ` Christian Brauner
2023-06-04 22:59                                         ` Dave Chinner
2023-06-05 11:37                                           ` Christian Brauner
2023-06-05 14:36                                             ` Theodore Ts'o
2023-06-06  0:54                                               ` Dave Chinner
2023-06-02 14:58                                       ` Theodore Ts'o
2023-06-04 22:35                                         ` Dave Chinner
2023-06-02 13:14                                   ` Christian Brauner
2023-05-23 17:35                               ` Mimi Zohar
2023-04-17 14:07                       ` Stefan Berger
2023-04-07  6:42                   ` Amir Goldstein
2023-04-06 16:10         ` Stefan Berger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.