Re: [RFC PATCH 3/3] ovl: implement stacked mmap for shared map

From: cgxu <cgxu519@mykernel.net>
To: Amir Goldstein <amir73il@gmail.com>
Cc: overlayfs <linux-unionfs@vger.kernel.org>,
	Linux MM <linux-mm@kvack.org>, Miklos Szeredi <miklos@szeredi.hu>,
	Andrew Morton <akpm@linux-foundation.org>,
	Ritesh Harjani <riteshh@linux.ibm.com>
Subject: Re: [RFC PATCH 3/3] ovl: implement stacked mmap for shared map
Date: Mon, 31 Aug 2020 21:47:07 +0800	[thread overview]
Message-ID: <e1e2c8f0-a3b8-0a3d-3093-6188b1a829f0@mykernel.net> (raw)
In-Reply-To: <CAOQ4uxisdtoccDoQe_fYUA-jXTfy0yk=gNcMSrmbkCYaeOEPuQ@mail.gmail.com>

On 8/30/20 7:33 PM, Amir Goldstein wrote:
> On Sat, Aug 29, 2020 at 12:51 PM Chengguang Xu <cgxu519@mykernel.net> wrote:
>>
>> Implement stacked mmap for shared map to keep data
>> consistency.
>>
>> Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
>> ---
>>   fs/overlayfs/file.c | 120 +++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 114 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
>> index 14ab5344a918..db5ab200d984 100644
>> --- a/fs/overlayfs/file.c
>> +++ b/fs/overlayfs/file.c
>> @@ -21,9 +21,17 @@ struct ovl_aio_req {
>>          struct fd fd;
>>   };
>>
>> +static vm_fault_t ovl_fault(struct vm_fault *vmf);
>> +static vm_fault_t ovl_page_mkwrite(struct vm_fault *vmf);
>> +
>> +static const struct vm_operations_struct ovl_vm_ops = {
>> +       .fault          = ovl_fault,
>> +       .page_mkwrite   = ovl_page_mkwrite,
>> +};
>> +
> 
> Interesting direction, not sure if this is workable.
> I don't know enough about mm to say.
> 
> But what about the rest of the operations?
> Did you go over them and decide that overlay doesn't need to implement them?
> I doubt it, but if you did, please document that.

I checked the rest of them; IIUC ->fault will be enough for this
special case (shared read-only mmap with no upper), so I will remove
->page_mkwrite in v2.

# I am not considering support for ->huge_fault at the current stage,
because many filesystems cannot support DAX properly.

BTW, do you know whom I should add to the CC list for a deeper review of
this code? The fsdevel list?

> 
>>   struct ovl_file_entry {
>>          struct file *realfile;
>> -       void *vm_ops;
>> +       const struct vm_operations_struct *vm_ops;
>>   };
>>
>>   struct file *ovl_get_realfile(struct file *file)
>> @@ -40,14 +48,15 @@ void ovl_set_realfile(struct file *file, struct file *realfile)
>>          ofe->realfile = realfile;
>>   }
>>
>> -void *ovl_get_real_vmops(struct file *file)
>> +const struct vm_operations_struct *ovl_get_real_vmops(struct file *file)
>>   {
>>          struct ovl_file_entry *ofe = file->private_data;
>>
>>          return ofe->vm_ops;
>>   }
>>
>> -void ovl_set_real_vmops(struct file *file, void *vm_ops)
>> +void ovl_set_real_vmops(struct file *file,
>> +                       const struct vm_operations_struct *vm_ops)
>>   {
>>          struct ovl_file_entry *ofe = file->private_data;
>>
>> @@ -493,11 +502,104 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>>          return ret;
>>   }
>>
>> +vm_fault_t ovl_fault(struct vm_fault *vmf)
>> +{
>> +       struct vm_area_struct *vma = vmf->vma;
>> +       struct file *file = vma->vm_file;
>> +       struct file *realfile;
>> +       struct file *fpin, *tmp;
>> +       struct inode *inode = file_inode(file);
>> +       struct inode *realinode;
>> +       const struct cred *old_cred;
>> +       bool retry_allowed;
>> +       vm_fault_t ret;
>> +       int err = 0;
>> +
>> +       if (fault_flag_check(vmf, FAULT_FLAG_TRIED)) {
>> +               realfile = ovl_get_realfile(file);
>> +
>> +               if (!ovl_has_upperdata(inode) ||
>> +                   realfile->f_inode != ovl_inode_upper(inode) ||
>> +                   !realfile->f_op->mmap)
>> +                       return VM_FAULT_SIGBUS;
>> +
>> +               if (!ovl_get_real_vmops(file)) {
>> +                       old_cred = ovl_override_creds(inode->i_sb);
>> +                       err = call_mmap(realfile, vma);
>> +                       revert_creds(old_cred);
>> +
>> +                       vma->vm_file = file;
>> +                       if (err) {
>> +                               vma->vm_ops = &ovl_vm_ops;
>> +                               return VM_FAULT_SIGBUS;
>> +                       }
>> +                       ovl_set_real_vmops(file, vma->vm_ops);
>> +                       vma->vm_ops = &ovl_vm_ops;
>> +               }
>> +
>> +               retry_allowed = fault_flag_check(vmf, FAULT_FLAG_ALLOW_RETRY);
>> +               if (retry_allowed)
>> +                       vma->vm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
>> +               vma->vm_file = realfile;
>> +               ret = ovl_get_real_vmops(file)->fault(vmf);
>> +               vma->vm_file = file;
>> +               if (retry_allowed)
>> +                       vma->vm_flags |= FAULT_FLAG_ALLOW_RETRY;
>> +               return ret;
>> +
>> +       } else {
>> +               fpin = maybe_unlock_mmap_for_io(vmf, NULL);
>> +               if (!fpin)
>> +                       return VM_FAULT_SIGBUS;
>> +
>> +               ret = VM_FAULT_RETRY;
>> +               if (!ovl_has_upperdata(inode)) {
>> +                       err = ovl_copy_up_with_data(file->f_path.dentry);
>> +                       if (err)
>> +                               goto out;
>> +               }
>> +
>> +               realinode = ovl_inode_realdata(inode);
>> +               realfile = ovl_open_realfile(file, realinode);
>> +               if (IS_ERR(realfile))
>> +                       goto out;
>> +
>> +               tmp = ovl_get_realfile(file);
>> +               ovl_set_realfile(file, realfile);
>> +               fput(tmp);
>> +
>> +out:
>> +               fput(fpin);
>> +               return ret;
>> +       }
>> +}
> 
> 
> Please add some documentation to explain the method used.
> Do we need to retry if real_vmops are already set?
> 

Good catch; a retry is actually not needed in that case.

Basically, when we detect that there is no upper inode, we unlock
mmap_lock, copy up, and open the real file, then retry the fault
operation. However, we need to check the fault retry flag carefully
to avoid retrying endlessly.

I'll add more explanation in v2.

---
cgxu