Re: [PATCH v19 16/27] x86/sgx: Add the Linux SGX Enclave Driver

From: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
To: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: x86@kernel.org, linux-sgx@vger.kernel.org,
	akpm@linux-foundation.org, dave.hansen@intel.com,
	nhorman@redhat.com, npmccallum@redhat.com, serge.ayoun@intel.com,
	shay.katz-zamir@intel.com, haitao.huang@intel.com,
	andriy.shevchenko@linux.intel.com, tglx@linutronix.de,
	kai.svahn@intel.com, bp@alien8.de, josh@joshtriplett.org,
	luto@kernel.org, kai.huang@intel.com, rientjes@google.com,
	Suresh Siddha <suresh.b.siddha@intel.com>
Subject: Re: [PATCH v19 16/27] x86/sgx: Add the Linux SGX Enclave Driver
Date: Thu, 21 Mar 2019 17:51:11 +0200	[thread overview]
Message-ID: <20190321155111.GR4603@linux.intel.com> (raw)
In-Reply-To: <20190319211951.GI25575@linux.intel.com>

On Tue, Mar 19, 2019 at 02:19:51PM -0700, Sean Christopherson wrote:
> IMO we should get rid of SGX_POWER_LOST_ENCLAVE and the SUSPEND flag.
> 
>   - Userspace needs to handle -EFAULT cleanly even if we hook into
>     suspend/hibernate via sgx_encl_pm_notifier(), e.g. to handle virtual
>     machine migration.
>   - In the event that suspend is canceled after sgx_encl_pm_notifier()
>     runs, we'll have prematurely invalidated the enclave.
>   - Invalidating all enclaves could be slow on a system with GBs of EPC,
>     i.e. probably not the best thing to do in the suspend path.
> 
> Removing SGX_POWER_LOST_ENCLAVE means we can drop all of the pm_notifier()
> code, which will likely save us a bit of maintenance down the line.

I don't disgree. Isn't it a racy flag in the VM context i.e. because
suspend can happen without SGX core noticing it (running inside a VM)?
That would a bug.

> > +
> > +/**
> > + * struct sgx_enclave_create - parameter structure for the
> > + *                             %SGX_IOC_ENCLAVE_CREATE ioctl
> > + * @src:	address for the SECS page data
> > + */
> > +struct sgx_enclave_create  {
> > +	__u64	src;
> > +};
> 
> ...
> 
> > diff --git a/arch/x86/kernel/cpu/sgx/driver/driver.h b/arch/x86/kernel/cpu/sgx/driver/driver.h
> > new file mode 100644
> > index 000000000000..b736411b5317
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/driver/driver.h
> > @@ -0,0 +1,38 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> > +/**
> > + * Copyright(c) 2016-19 Intel Corporation.
> > + */
> > +#ifndef __ARCH_INTEL_SGX_H__
> > +#define __ARCH_INTEL_SGX_H__
> > +
> > +#include <crypto/hash.h>
> > +#include <linux/kref.h>
> > +#include <linux/mmu_notifier.h>
> > +#include <linux/radix-tree.h>
> > +#include <linux/rwsem.h>
> > +#include <linux/sched.h>
> > +#include <linux/workqueue.h>
> > +#include <uapi/asm/sgx.h>
> > +#include "../arch.h"
> > +#include "../encl.h"
> > +#include "../encls.h"
> > +#include "../sgx.h"
> 
> Yuck.  If we remove the driver specific Makefile then we can eliminate
> the "../" prefix here.  E.g. in the main SGX Makefile:
> 
> obj-$(CONFIG_INTEL_SGX_DRIVER) += driver/main.o driver/ioctl.o

I think this is a great idea.

> 
> > +#define SGX_DRV_NR_DEVICES	1
> > +#define SGX_EINIT_SPIN_COUNT	20
> > +#define SGX_EINIT_SLEEP_COUNT	50
> > +#define SGX_EINIT_SLEEP_TIME	20
> > +
> > +extern struct workqueue_struct *sgx_encl_wq;
> > +extern u64 sgx_encl_size_max_32;
> > +extern u64 sgx_encl_size_max_64;
> > +extern u32 sgx_misc_reserved_mask;
> > +extern u64 sgx_attributes_reserved_mask;
> > +extern u64 sgx_xfrm_reserved_mask;
> > +extern u32 sgx_xsave_size_tbl[64];
> > +
> > +extern const struct file_operations sgx_fs_provision_fops;
> > +
> > +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
> > +
> > +#endif /* __ARCH_X86_INTEL_SGX_H__ */
> > diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
> > new file mode 100644
> > index 000000000000..4b9a91b53b50
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
> 
> ...
> 
> > +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long addr,
> > +			     void *data, struct sgx_secinfo *secinfo,
> > +			     unsigned int mrmask)
> > +{
> > +	u64 page_type = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
> > +	struct sgx_encl_page *encl_page;
> > +	int ret;
> > +
> > +	if (sgx_validate_secinfo(secinfo))
> > +		return -EINVAL;
> > +	if (page_type == SGX_SECINFO_TCS) {
> 
> Should we validate page_type itself?

Yes I don't see why not.

> 
> > +		ret = sgx_validate_tcs(encl, data);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	mutex_lock(&encl->lock);
> > +
> > +	if (encl->flags & (SGX_ENCL_INITIALIZED | SGX_ENCL_DEAD)) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	encl_page = sgx_encl_page_alloc(encl, addr);
> > +	if (IS_ERR(encl_page)) {
> > +		ret = PTR_ERR(encl_page);
> > +		goto out;
> > +	}
> > +
> > +	ret = __sgx_encl_add_page(encl, encl_page, data, secinfo, mrmask);
> > +	if (ret) {
> > +		radix_tree_delete(&encl_page->encl->page_tree,
> > +				  PFN_DOWN(encl_page->desc));
> > +		kfree(encl_page);
> > +	}
> > +
> > +out:
> > +	mutex_unlock(&encl->lock);
> > +	return ret;
> > +}
> 
> ...
> 
> > +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
> > +			 struct sgx_einittoken *token)
> > +{
> > +	u64 mrsigner[4];
> > +	int ret;
> > +	int i;
> > +	int j;
> 
> These can be on a single line, e.g.: "int ret, i, j;".

I tend to put always all declarations to their own lines whenever I
write any code. When I wear my maintainer hat I tend to accept either
style for new code.

> 
> > +
> > +	/* Check that the required attributes have been authorized. */
> > +	if (encl->secs_attributes & ~encl->allowed_attributes)
> > +		return -EINVAL;
> > +
> > +	ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
> > +	if (ret)
> > +		return ret;
> > +
> > +	flush_work(&encl->work);
> > +
> > +	mutex_lock(&encl->lock);
> > +
> > +	if (encl->flags & SGX_ENCL_INITIALIZED)
> > +		goto err_out;
> > +
> > +	if (encl->flags & SGX_ENCL_DEAD) {
> > +		ret = -EFAULT;
> > +		goto err_out;
> > +	}
> > +
> > +	for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
> > +		for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
> > +			ret = sgx_einit(sigstruct, token, encl->secs.epc_page,
> > +					mrsigner);
> > +			if (ret == SGX_UNMASKED_EVENT)
> > +				continue;
> > +			else
> > +				break;
> > +		}
> > +
> > +		if (ret != SGX_UNMASKED_EVENT)
> > +			break;
> > +
> > +		msleep_interruptible(SGX_EINIT_SLEEP_TIME);
> > +
> > +		if (signal_pending(current)) {
> > +			ret = -ERESTARTSYS;
> > +			goto err_out;
> > +		}
> > +	}
> > +
> > +	if (encls_faulted(ret)) {
> > +		if (encls_failed(ret))
> > +			ENCLS_WARN(ret, "EINIT");
> > +
> > +		sgx_encl_destroy(encl);
> > +		ret = -EFAULT;
> > +	} else if (encls_returned_code(ret)) {
> > +		pr_debug("EINIT returned %d\n", ret);
> > +	} else {
> > +		encl->flags |= SGX_ENCL_INITIALIZED;
> > +	}
> > +
> > +err_out:
> > +	mutex_unlock(&encl->lock);
> > +	return ret;
> > +}
> 
> ...
> 
> > diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c b/arch/x86/kernel/cpu/sgx/driver/main.c
> > new file mode 100644
> > index 000000000000..16f36cd0af04
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/driver/main.c
> > @@ -0,0 +1,290 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > +// Copyright(c) 2016-18 Intel Corporation.
> > +
> > +#include <linux/acpi.h>
> > +#include <linux/cdev.h>
> > +#include <linux/mman.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/security.h>
> > +#include <linux/suspend.h>
> > +#include <asm/traps.h>
> > +#include "driver.h"
> > +
> > +MODULE_DESCRIPTION("Intel SGX Enclave Driver");
> > +MODULE_AUTHOR("Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>");
> > +MODULE_LICENSE("Dual BSD/GPL");
> 
> ...
> 
> > +static const struct file_operations sgx_ctrl_fops = {
> 
> Why sgx_ctrl_fops instead of e.g. sgx_dev_fops, sgx_device_fops,
> sgx_driver_fops, etc...

sgx_dev_fops would be a better name.

> 
> > +	.owner			= THIS_MODULE,
> > +	.unlocked_ioctl		= sgx_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl		= sgx_compat_ioctl,
> > +#endif
> > +	.mmap			= sgx_mmap,
> > +	.get_unmapped_area	= sgx_get_unmapped_area,
> > +};
> > +
> > +static struct bus_type sgx_bus_type = {
> > +	.name	= "sgx",
> > +};
> > +
> > +static char *sgx_devnode(struct device *dev, umode_t *mode,
> > +			 kuid_t *uid, kgid_t *gid)
> > +{
> > +	if (mode)
> > +		*mode = 0666;
> > +	return kasprintf(GFP_KERNEL, "sgx");
> > +}
> > +
> > +static const struct device_type sgx_device_type = {
> > +	.name = "sgx",
> > +	.devnode = sgx_devnode,
> > +};
> > +
> > +struct sgx_dev_ctx {
> > +	struct device ctrl_dev;
> > +	struct cdev ctrl_cdev;
> > +};
> > +
> > +static dev_t sgx_devt;
> > +
> > +static void sgx_dev_release(struct device *dev)
> > +{
> > +	struct sgx_dev_ctx *ctx = container_of(dev, struct sgx_dev_ctx,
> > +					       ctrl_dev);
> > +
> > +	kfree(ctx);
> > +}
> > +
> > +static struct sgx_dev_ctx *sgx_dev_ctx_alloc(struct device *parent)
> > +{
> > +	struct sgx_dev_ctx *ctx;
> > +	int ret;
> > +
> > +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> > +	if (!ctx)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	device_initialize(&ctx->ctrl_dev);
> > +
> > +	ctx->ctrl_dev.bus = &sgx_bus_type;
> > +	ctx->ctrl_dev.type = &sgx_device_type;
> > +	ctx->ctrl_dev.parent = parent;
> > +	ctx->ctrl_dev.devt = MKDEV(MAJOR(sgx_devt), 0);
> > +	ctx->ctrl_dev.release = sgx_dev_release;
> > +
> > +	ret = dev_set_name(&ctx->ctrl_dev, "sgx");
> > +	if (ret) {
> > +		put_device(&ctx->ctrl_dev);
> > +		return ERR_PTR(ret);
> > +	}
> > +
> > +	cdev_init(&ctx->ctrl_cdev, &sgx_ctrl_fops);
> > +	ctx->ctrl_cdev.owner = THIS_MODULE;
> > +
> > +	dev_set_drvdata(parent, ctx);
> > +
> > +	return ctx;
> > +}
> > +
> > +static struct sgx_dev_ctx *sgxm_dev_ctx_alloc(struct device *parent)
> > +{
> > +	struct sgx_dev_ctx *ctx;
> > +	int rc;
> > +
> > +	ctx = sgx_dev_ctx_alloc(parent);
> > +	if (IS_ERR(ctx))
> > +		return ctx;
> > +
> > +	rc = devm_add_action_or_reset(parent, (void (*)(void *))put_device,
> > +				      &ctx->ctrl_dev);
> > +	if (rc)
> > +		return ERR_PTR(rc);
> > +
> > +	return ctx;
> > +}
> > +
> > +static int sgx_dev_init(struct device *parent)
> > +{
> > +	struct sgx_dev_ctx *sgx_dev;
> > +	unsigned int eax;
> > +	unsigned int ebx;
> > +	unsigned int ecx;
> > +	unsigned int edx;
> > +	u64 attr_mask;
> > +	u64 xfrm_mask;
> > +	int ret;
> > +	int i;
> > +
> > +	cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
> > +	sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
> > +	sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF);
> > +	sgx_encl_size_max_32 = 1ULL << (edx & 0xFF);
> > +
> > +	cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
> > +
> > +	attr_mask = (((u64)ebx) << 32) + (u64)eax;
> > +	sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
> > +
> > +	if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> > +		xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
> > +
> > +		for (i = 2; i < 64; i++) {
> > +			cpuid_count(0x0D, i, &eax, &ebx, &ecx, &edx);
> > +			if ((1 << i) & xfrm_mask)
> > +				sgx_xsave_size_tbl[i] = eax + ebx;
> > +		}
> > +
> > +		sgx_xfrm_reserved_mask = ~xfrm_mask;
> > +	}
> > +
> > +	sgx_dev = sgxm_dev_ctx_alloc(parent);
> > +	if (IS_ERR(sgx_dev))
> > +		return PTR_ERR(sgx_dev);
> > +
> > +	sgx_encl_wq = alloc_workqueue("sgx-encl-wq",
> > +				      WQ_UNBOUND | WQ_FREEZABLE, 1);
> > +	if (!sgx_encl_wq)
> > +		return -ENOMEM;
> > +
> > +	ret = cdev_device_add(&sgx_dev->ctrl_cdev, &sgx_dev->ctrl_dev);
> > +	if (ret)
> > +		goto err_device_add;
> > +
> > +	return 0;
> > +
> > +err_device_add:
> > +	destroy_workqueue(sgx_encl_wq);
> > +	return ret;
> > +}
> > +
> > +static int sgx_drv_probe(struct platform_device *pdev)
> > +{
> > +	if (!sgx_enabled) {
> > +		pr_info("sgx: SGX is not enabled in the core\n");
> > +		return -ENODEV;
> > +	}
> > +
> > +	if (!boot_cpu_has(X86_FEATURE_SGX_LC)) {
> > +		pr_info("sgx: The public key MSRs are not writable\n");
> > +		return -ENODEV;
> > +	}
> > +
> > +	return sgx_dev_init(&pdev->dev);
> > +}
> > +
> > +static int sgx_drv_remove(struct platform_device *pdev)
> > +{
> > +	struct sgx_dev_ctx *ctx = dev_get_drvdata(&pdev->dev);
> > +
> > +	cdev_device_del(&ctx->ctrl_cdev, &ctx->ctrl_dev);
> > +	destroy_workqueue(sgx_encl_wq);
> > +
> > +	return 0;
> > +}
> > +
> > +#ifdef CONFIG_ACPI
> > +static struct acpi_device_id sgx_device_ids[] = {
> > +	{"INT0E0C", 0},
> > +	{"", 0},
> > +};
> > +MODULE_DEVICE_TABLE(acpi, sgx_device_ids);
> > +#endif
> > +
> > +static struct platform_driver sgx_drv = {
> > +	.probe = sgx_drv_probe,
> > +	.remove = sgx_drv_remove,
> > +	.driver = {
> > +		.name			= "sgx",
> > +		.acpi_match_table	= ACPI_PTR(sgx_device_ids),
> > +	},
> > +};
> 
> Utilizing the platform driver is unnecessary, adds complexity and IMO is
> flat out wrong given the current direction of implementing SGX as a
> full-blooded architectural feature.
> 
>   - All hardware information is readily available via CPUID
>   - arch_initcall hooks obviates the need for ACPI autoprobe
>   - EPC manager assumes it has full control over all EPC, i.e. EPC
>     sections are not managed as independent "devices"
>   - BIOS will enumerate a single ACPI entry regardless of the number
>     of EPC sections, i.e. the ACPI entry is *only* useful for probing
>   - Userspace driver matches the EPC device, but doesn't actually
>     "own" the EPC

It is for hotplugging. I don't really have strong opinions of this but
having a driver for uapi allows things like blacklisting sgx.

> 
> > +
> > +static int __init sgx_drv_subsys_init(void)
> > +{
> > +     int ret;
> > +
> > +     ret = bus_register(&sgx_bus_type);
> 
> Do we really need a bus/class?  Allocating a chrdev region also seems like
> overkill.  At this point there is exactly one SGX device, and while there
> is a pretty good chance we'll end up with a virtualization specific device
> for exposing EPC to guest, there's no guarantee said device will be SGX
> specific.  Using a dynamic miscdevice would eliminate a big chunk of code.

AFAIK misc is not recommended for any new drivers as it has suvere
limitations like not allowing to add non-racy sysfs attributes. Whatever
the solution is, lets not use misc.

> 
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = alloc_chrdev_region(&sgx_devt, 0, SGX_DRV_NR_DEVICES, "sgx");
> > +	if (ret < 0) {
> > +		bus_unregister(&sgx_bus_type);
> > +		return ret;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void sgx_drv_subsys_exit(void)
> > +{
> > +	bus_unregister(&sgx_bus_type);
> > +	unregister_chrdev_region(sgx_devt, SGX_DRV_NR_DEVICES);
> > +}
> > +
> > +static int __init sgx_drv_init(void)
> > +{
> > +	int ret;
> > +
> > +	ret = sgx_drv_subsys_init();
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = platform_driver_register(&sgx_drv);
> > +	if (ret)
> > +		sgx_drv_subsys_exit();
> > +
> > +	return ret;
> > +}
> > +module_init(sgx_drv_init);
> > +
> > +static void __exit sgx_drv_exit(void)
> > +{
> > +	platform_driver_unregister(&sgx_drv);
> > +	sgx_drv_subsys_exit();
> > +}
> > +module_exit(sgx_drv_exit);
> > diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> > new file mode 100644
> > index 000000000000..bd8bcd748976
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/encl.c
> > @@ -0,0 +1,358 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > +// Copyright(c) 2016-18 Intel Corporation.
> > +
> > +#include <linux/mm.h>
> > +#include <linux/shmem_fs.h>
> > +#include <linux/suspend.h>
> > +#include <linux/sched/mm.h>
> > +#include "arch.h"
> > +#include "encl.h"
> > +#include "sgx.h"
> > +
> > +static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
> > +						unsigned long addr)
> > +{
> > +	struct sgx_encl_page *entry;
> > +
> > +	/* If process was forked, VMA is still there but vm_private_data is set
> > +	 * to NULL.
> > +	 */
> > +	if (!encl)
> > +		return ERR_PTR(-EFAULT);
> > +
> > +	if ((encl->flags & SGX_ENCL_DEAD) ||
> > +	    !(encl->flags & SGX_ENCL_INITIALIZED))
> > +		return ERR_PTR(-EFAULT);
> > +
> > +	entry = radix_tree_lookup(&encl->page_tree, addr >> PAGE_SHIFT);
> > +	if (!entry)
> > +		return ERR_PTR(-EFAULT);
> > +
> > +	/* Page is already resident in the EPC. */
> > +	if (entry->epc_page)
> > +		return entry;
> > +
> > +	return ERR_PTR(-EFAULT);
> > +}
> > +
> > +static struct sgx_encl_mm *sgx_encl_get_mm(struct sgx_encl *encl,
> > +					   struct mm_struct *mm)
> > +{
> > +	struct sgx_encl_mm *next_mm = NULL;
> > +	struct sgx_encl_mm *prev_mm = NULL;
> > +	int iter;
> > +
> > +	while (true) {
> > +		next_mm = sgx_encl_next_mm(encl, prev_mm, &iter);
> > +		if (prev_mm) {
> > +			mmdrop(prev_mm->mm);
> > +			kref_put(&prev_mm->refcount, sgx_encl_release_mm);
> > +		}
> > +		prev_mm = next_mm;
> > +
> > +		if (iter == SGX_ENCL_MM_ITER_DONE)
> > +			break;
> > +
> > +		if (iter == SGX_ENCL_MM_ITER_RESTART)
> > +			continue;
> > +
> > +		if (mm == next_mm->mm)
> > +			return next_mm;
> > +	}
> > +
> > +	return NULL;
> > +}
> > +
> > +static void sgx_vma_open(struct vm_area_struct *vma)
> > +{
> > +	struct sgx_encl *encl = vma->vm_private_data;
> > +	struct sgx_encl_mm *mm;
> > +
> > +	if (!encl)
> > +		return;
> > +
> > +	if (encl->flags & SGX_ENCL_DEAD)
> > +		goto out;
> > +
> > +	mm = sgx_encl_get_mm(encl, vma->vm_mm);
> > +	if (!mm) {
> > +		mm = kzalloc(sizeof(*mm), GFP_KERNEL);
> > +		if (!mm) {
> > +			encl->flags |= SGX_ENCL_DEAD;
> 
> Failure to allocate memory for one user of the enclave shouldn't kill
> the enclave, e.g. failing during fork() shouldn't kill the enclave in
> the the parent.  And marking an enclave dead without holding its lock
> is all kinds of bad.

This is almost non-existent occasion. Agree with the locking though..
And I'm open for other fallbacks but given the rarity I think the
current one in sustainable.

> 
> > +			goto out;
> > +		}
> > +
> > +		spin_lock(&encl->mm_lock);
> > +		mm->encl = encl;
> > +		mm->mm = vma->vm_mm;
> > +		list_add(&mm->list, &encl->mm_list);
> > +		kref_init(&mm->refcount);
> 
> Not that it truly matters, but list_add() is the only thing that needs
> to be protected with the spinlock, everything else can be done ahead of
> time.

True :-)

> 
> > +		spin_unlock(&encl->mm_lock);
> > +	} else {
> > +		mmdrop(mm->mm);
> > +	}
> > +
> > +out:
> > +	kref_get(&encl->refcount);
> > +}
> > +
> > +static void sgx_vma_close(struct vm_area_struct *vma)
> > +{
> > +	struct sgx_encl *encl = vma->vm_private_data;
> > +	struct sgx_encl_mm *mm;
> > +
> > +	if (!encl)
> > +		return;
> > +
> > +	mm = sgx_encl_get_mm(encl, vma->vm_mm);
> 
> Isn't this unnecessary?  sgx_vma_open() had to have been called on this
> VMA, otherwise we wouldn't be here.

Not in the case when allocation fails in vma_open.

> 
> > +	if (mm) {
> > +		mmdrop(mm->mm);
> > +		kref_put(&mm->refcount, sgx_encl_release_mm);
> > +
> > +		/* Release kref for the VMA. */
> > +		kref_put(&mm->refcount, sgx_encl_release_mm);
> > +	}
> > +
> > +	kref_put(&encl->refcount, sgx_encl_release);
> > +}
> > +
> > +static unsigned int sgx_vma_fault(struct vm_fault *vmf)
> > +{
> > +	unsigned long addr = (unsigned long)vmf->address;
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct sgx_encl *encl = vma->vm_private_data;
> > +	struct sgx_encl_page *entry;
> > +	int ret = VM_FAULT_NOPAGE;
> > +	struct sgx_encl_mm *mm;
> > +	unsigned long pfn;
> > +
> > +	mm = sgx_encl_get_mm(encl, vma->vm_mm);
> > +	if (!mm)
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	mmdrop(mm->mm);
> > +	kref_put(&mm->refcount, sgx_encl_release_mm);
> 
> Again, is retrieving the sgx_encl_mm necessary?  Isn't it impossible to
> get here unless the vma has an reference to the enclave?  I.e. it should
> be impossible to fault before open() or after close().
> 
> Dropping sgx_encl_get_mm() from both sgx_vma_close() and sgx_vma_fault()
> would mean sgx_vma_open() is the only user (as of this patch), i.e. can
> open code walking through the list and wouldn't need to confusing
> mmdrop().  It'd be even cleaner if we can use an RCU list.

These checks handle the case when allocation fails in vma_open.

> 
> > +
> > +	mutex_lock(&encl->lock);
> > +
> > +	entry = sgx_encl_load_page(encl, addr);
> > +	if (IS_ERR(entry)) {
> > +		if (unlikely(PTR_ERR(entry) != -EBUSY))
> > +			ret = VM_FAULT_SIGBUS;
> > +
> > +		goto out;
> > +	}
> > +
> > +	if (!follow_pfn(vma, addr, &pfn))
> > +		goto out;
> > +
> > +	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(entry->epc_page->desc));
> > +	if (ret != VM_FAULT_NOPAGE) {
> > +		ret = VM_FAULT_SIGBUS;
> > +		goto out;
> > +	}
> > +
> > +out:
> > +	mutex_unlock(&encl->lock);
> > +	return ret;
> > +}
> > +
> > +const struct vm_operations_struct sgx_vm_ops = {
> > +	.close = sgx_vma_close,
> > +	.open = sgx_vma_open,
> > +	.fault = sgx_vma_fault,
> > +};
> > +EXPORT_SYMBOL_GPL(sgx_vm_ops);
> > +
> > +/**
> > + * sgx_encl_find - find an enclave
> > + * @mm:		mm struct of the current process
> > + * @addr:	address in the ELRANGE
> > + * @vma:	the resulting VMA
> > + *
> > + * Find an enclave identified by the given address. Give back a VMA that is
> > + * part of the enclave and located in that address. The VMA is given back if it
> > + * is a proper enclave VMA even if an &sgx_encl instance does not exist yet
> > + * (enclave creation has not been performed).
> > + *
> > + * Return:
> > + *   0 on success,
> > + *   -EINVAL if an enclave was not found,
> > + *   -ENOENT if the enclave has not been created yet
> > + */
> > +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> > +		  struct vm_area_struct **vma)
> > +{
> > +	struct vm_area_struct *result;
> > +	struct sgx_encl *encl;
> > +
> > +	result = find_vma(mm, addr);
> > +	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
> > +		return -EINVAL;
> > +
> > +	encl = result->vm_private_data;
> > +	*vma = result;
> > +
> > +	return encl ? 0 : -ENOENT;
> > +}
> > +EXPORT_SYMBOL_GPL(sgx_encl_find);
> > +
> > +/**
> > + * sgx_encl_destroy() - destroy enclave resources
> > + * @encl:	an &sgx_encl instance
> > + */
> > +void sgx_encl_destroy(struct sgx_encl *encl)
> > +{
> > +	struct sgx_encl_page *entry;
> > +	struct radix_tree_iter iter;
> > +	void **slot;
> > +
> > +	encl->flags |= SGX_ENCL_DEAD;
> > +
> > +	radix_tree_for_each_slot(slot, &encl->page_tree, &iter, 0) {
> > +		entry = *slot;
> > +		if (entry->epc_page) {
> > +			if (!__sgx_free_page(entry->epc_page)) {
> > +				encl->secs_child_cnt--;
> > +				entry->epc_page = NULL;
> > +
> > +			}
> > +
> > +			radix_tree_delete(&entry->encl->page_tree,
> > +					  PFN_DOWN(entry->desc));
> > +		}
> > +	}
> > +
> > +	if (!encl->secs_child_cnt && encl->secs.epc_page) {
> > +		sgx_free_page(encl->secs.epc_page);
> > +		encl->secs.epc_page = NULL;
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(sgx_encl_destroy);
> > +
> > +/**
> > + * sgx_encl_release - Destroy an enclave instance
> > + * @kref:	address of a kref inside &sgx_encl
> > + *
> > + * Used together with kref_put(). Frees all the resources associated with the
> > + * enclave and the instance itself.
> > + */
> > +void sgx_encl_release(struct kref *ref)
> > +{
> > +	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
> > +	struct sgx_encl_mm *mm;
> > +
> > +	if (encl->pm_notifier.notifier_call)
> > +		unregister_pm_notifier(&encl->pm_notifier);
> > +
> > +	sgx_encl_destroy(encl);
> > +
> > +	if (encl->backing)
> > +		fput(encl->backing);
> > +
> > +	/* If sgx_encl_create() fails, can be non-empty */
> > +	while (!list_empty(&encl->mm_list)) {
> > +		mm = list_first_entry(&encl->mm_list, struct sgx_encl_mm, list);
> 
> Any reason for while()+list_first_entry() instead of list_for_each_entry_safe()?

Just a preference.

> 
> > +		list_del(&mm->list);
> > +		kfree(mm);
> > +	}
> > +
> > +	kfree(encl);
> > +}
> > +EXPORT_SYMBOL_GPL(sgx_encl_release);
> > +
> > +/**
> > + * sgx_encl_get_index() - Convert a page descriptor to a page index
> > + * @encl:	an enclave
> > + * @page:	an enclave page
> > + *
> > + * Given an enclave page descriptor, convert it to a page index used to access
> > + * backing storage. The backing page for SECS is located after the enclave
> > + * pages.
> > + */
> > +pgoff_t sgx_encl_get_index(struct sgx_encl *encl, struct sgx_encl_page *page)
> > +{
> > +	if (!PFN_DOWN(page->desc))
> > +		return PFN_DOWN(encl->size);
> > +
> > +	return PFN_DOWN(page->desc - encl->base);
> > +}
> > +EXPORT_SYMBOL_GPL(sgx_encl_get_index);
> > +
> > +/**
> > + * sgx_encl_encl_get_backing_page() - Pin the backing page
> > + * @encl:	an enclave
> > + * @index:	page index
> > + *
> > + * Return: the pinned backing page
> > + */
> > +struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index)
> > +{
> > +	struct inode *inode = encl->backing->f_path.dentry->d_inode;
> > +	struct address_space *mapping = inode->i_mapping;
> > +	gfp_t gfpmask = mapping_gfp_mask(mapping);
> > +
> > +	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
> > +}
> > +EXPORT_SYMBOL_GPL(sgx_encl_get_backing_page);
> > +
> > +/**
> > + * sgx_encl_next_mm() - Iterate to the next mm
> > + * @encl:	an enclave
> > + * @mm:		an mm list entry
> > + * @iter:	iterator status
> > + *
> > + * Return: the enclave mm or NULL
> > + */
> > +struct sgx_encl_mm *sgx_encl_next_mm(struct sgx_encl *encl,
> > +				     struct sgx_encl_mm *mm, int *iter)
> > +{
> > +	struct list_head *entry;
> > +
> > +	WARN(!encl, "%s: encl is NULL", __func__);
> > +	WARN(!iter, "%s: iter is NULL", __func__);
> > +
> > +	spin_lock(&encl->mm_lock);
> > +
> > +	entry = mm ? mm->list.next : encl->mm_list.next;
> > +	WARN(!entry, "%s: entry is NULL", __func__);
> 
> I'm pretty sure we can simply use an RCU list.  Iteration is then simply
> list_for_each_entry_rcu() instead of this custom walker.  Peeking into the
> future, we don't need perfect accuracy for aging pages, e.g. working with
> a stale snapshot is perfectly ok.
> 
> Zapping PTEs does need 100% accuracy, but that will be guaranteed via the
> SGX_ENCL_PAGE_RECLAIMED flag.  Any mm added to the list after we start
> zapping will be rejected by the page fault handler, i.e. won't be able
> to reference the page being reclaimed.

I'm not rejecting RCUs but still think that lock based implementation
could be more easy to stabilize, and (S)RCU based implemenation could
be something to improved after landing the patch set.

> 
> > +
> > +	if (entry == &encl->mm_list) {
> > +		mm = NULL;
> > +		*iter = SGX_ENCL_MM_ITER_DONE;
> > +		goto out;
> > +	}
> > +
> > +	mm = list_entry(entry, struct sgx_encl_mm, list);
> > +
> > +	if (!kref_get_unless_zero(&mm->refcount)) {
> > +		*iter = SGX_ENCL_MM_ITER_RESTART;
> > +		mm = NULL;
> > +		goto out;
> > +	}
> > +
> > +	if (!atomic_add_unless(&mm->mm->mm_count, 1, 0)) {
> > +		kref_put(&mm->refcount, sgx_encl_release_mm);
> > +		mm = NULL;
> > +		*iter = SGX_ENCL_MM_ITER_RESTART;
> > +		goto out;
> > +	}
> > +
> > +	*iter = SGX_ENCL_MM_ITER_NEXT;
> > +
> > +out:
> > +	spin_unlock(&encl->mm_lock);
> > +	return mm;
> > +}
> > +
> > +void sgx_encl_release_mm(struct kref *ref)
> > +{
> > +	struct sgx_encl_mm *mm =
> > +		container_of(ref, struct sgx_encl_mm, refcount);
> > +
> > +	spin_lock(&mm->encl->mm_lock);
> > +	list_del(&mm->list);
> > +	spin_unlock(&mm->encl->mm_lock);
> > +
> > +	kfree(mm);
> > +}

/Jarkko