[v4,1/2] cgroup: svm: Add Encryption ID controller
diff mbox series

Message ID 20210108012846.4134815-2-vipinsh@google.com
State New, archived
Headers show
Series
  • cgroup: KVM: New Encryption IDs cgroup controller
Related show

Commit Message

Vipin Sharma Jan. 8, 2021, 1:28 a.m. UTC
Hardware memory encryption is available on multiple generic CPUs. For
example AMD has Secure Encrypted Virtualization (SEV) and SEV -
Encrypted State (SEV-ES).

These memory encryptions are useful in creating encrypted virtual
machines (VMs) and user space programs.

There are limited number of encryption IDs that can be used
simultaneously on a machine for encryption. This generates a need for
the system admin to track, limit, allocate resources, and optimally
schedule VMs and user workloads in the cloud infrastructure. Some
malicious programs can exhaust all of these resources on a host causing
starvation of other workloads.

Encryption ID controller allows control of these resources using
Cgroups.

Controller is enabled by CGROUP_ENCRYPTION_IDS config option.
Encryption controller provide 3 interface files for each encryption ID
type. For example, in SEV:

1. encrpytion_ids.sev.max
	Sets the maximum usage of SEV IDs in the cgroup.
2. encryption_ids.sev.current
	Current usage of SEV IDs in the cgroup and its children.
3. encryption_ids.sev.stat
	Shown only at the root cgroup. Displays total SEV IDs available
	on the platform and current usage count.

Other ID types can be easily added in the controller in the same way.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Reviewed-by: David Rientjes <rientjes@google.com>
Reviewed-by: Dionna Glaze <dionnaglaze@google.com>
---
 arch/x86/kvm/svm/sev.c                |  52 +++-
 include/linux/cgroup_subsys.h         |   4 +
 include/linux/encryption_ids_cgroup.h |  72 +++++
 include/linux/kvm_host.h              |   4 +
 init/Kconfig                          |  14 +
 kernel/cgroup/Makefile                |   1 +
 kernel/cgroup/encryption_ids.c        | 422 ++++++++++++++++++++++++++
 7 files changed, 557 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

Comments

Brijesh Singh Jan. 13, 2021, 3:19 p.m. UTC | #1
On 1/7/2021 7:28 PM, Vipin Sharma wrote:
> Hardware memory encryption is available on multiple generic CPUs. For
> example AMD has Secure Encrypted Virtualization (SEV) and SEV -
> Encrypted State (SEV-ES).
>
> These memory encryptions are useful in creating encrypted virtual
> machines (VMs) and user space programs.
>
> There are limited number of encryption IDs that can be used
> simultaneously on a machine for encryption. This generates a need for
> the system admin to track, limit, allocate resources, and optimally
> schedule VMs and user workloads in the cloud infrastructure. Some
> malicious programs can exhaust all of these resources on a host causing
> starvation of other workloads.
>
> Encryption ID controller allows control of these resources using
> Cgroups.
>
> Controller is enabled by CGROUP_ENCRYPTION_IDS config option.
> Encryption controller provide 3 interface files for each encryption ID
> type. For example, in SEV:
>
> 1. encrpytion_ids.sev.max
> 	Sets the maximum usage of SEV IDs in the cgroup.
> 2. encryption_ids.sev.current
> 	Current usage of SEV IDs in the cgroup and its children.
> 3. encryption_ids.sev.stat
> 	Shown only at the root cgroup. Displays total SEV IDs available
> 	on the platform and current usage count.
>
> Other ID types can be easily added in the controller in the same way.
>
> Signed-off-by: Vipin Sharma <vipinsh@google.com>
> Reviewed-by: David Rientjes <rientjes@google.com>
> Reviewed-by: Dionna Glaze <dionnaglaze@google.com>


Acked-by: Brijesh Singh <brijesh.singh@amd.com>

> ---
>   arch/x86/kvm/svm/sev.c                |  52 +++-
>   include/linux/cgroup_subsys.h         |   4 +
>   include/linux/encryption_ids_cgroup.h |  72 +++++
>   include/linux/kvm_host.h              |   4 +
>   init/Kconfig                          |  14 +
>   kernel/cgroup/Makefile                |   1 +
>   kernel/cgroup/encryption_ids.c        | 422 ++++++++++++++++++++++++++
>   7 files changed, 557 insertions(+), 12 deletions(-)
>   create mode 100644 include/linux/encryption_ids_cgroup.h
>   create mode 100644 kernel/cgroup/encryption_ids.c
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 9858d5ae9ddd..1924ab2eaf11 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -14,6 +14,7 @@
>   #include <linux/psp-sev.h>
>   #include <linux/pagemap.h>
>   #include <linux/swap.h>
> +#include <linux/encryption_ids_cgroup.h>
>   #include <linux/processor.h>
>   #include <linux/trace_events.h>
>   #include <asm/fpu/internal.h>
> @@ -86,10 +87,18 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
>   	return true;
>   }
>   
> -static int sev_asid_new(struct kvm_sev_info *sev)
> +static int sev_asid_new(struct kvm *kvm)
>   {
> -	int pos, min_asid, max_asid;
> +	int pos, min_asid, max_asid, ret;
> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>   	bool retry = true;
> +	enum encryption_id_type type;
> +
> +	type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
> +
> +	ret = enc_id_cg_try_charge(kvm, type, 1);
> +	if (ret)
> +		return ret;
>   
>   	mutex_lock(&sev_bitmap_lock);
>   
> @@ -107,7 +116,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
>   			goto again;
>   		}
>   		mutex_unlock(&sev_bitmap_lock);
> -		return -EBUSY;
> +		ret = -EBUSY;
> +		goto e_uncharge;
>   	}
>   
>   	__set_bit(pos, sev_asid_bitmap);
> @@ -115,6 +125,9 @@ static int sev_asid_new(struct kvm_sev_info *sev)
>   	mutex_unlock(&sev_bitmap_lock);
>   
>   	return pos + 1;
> +e_uncharge:
> +	enc_id_cg_uncharge(kvm, type, 1);
> +	return ret;
>   }
>   
>   static int sev_get_asid(struct kvm *kvm)
> @@ -124,14 +137,16 @@ static int sev_get_asid(struct kvm *kvm)
>   	return sev->asid;
>   }
>   
> -static void sev_asid_free(int asid)
> +static void sev_asid_free(struct kvm *kvm)
>   {
>   	struct svm_cpu_data *sd;
>   	int cpu, pos;
> +	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +	enum encryption_id_type type;
>   
>   	mutex_lock(&sev_bitmap_lock);
>   
> -	pos = asid - 1;
> +	pos = sev->asid - 1;
>   	__set_bit(pos, sev_reclaim_asid_bitmap);
>   
>   	for_each_possible_cpu(cpu) {
> @@ -140,6 +155,9 @@ static void sev_asid_free(int asid)
>   	}
>   
>   	mutex_unlock(&sev_bitmap_lock);
> +
> +	type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
> +	enc_id_cg_uncharge(kvm, type, 1);
>   }
>   
>   static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
> @@ -184,22 +202,22 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>   	if (unlikely(sev->active))
>   		return ret;
>   
> -	asid = sev_asid_new(sev);
> +	asid = sev_asid_new(kvm);
>   	if (asid < 0)
>   		return ret;
> +	sev->asid = asid;
>   
>   	ret = sev_platform_init(&argp->error);
>   	if (ret)
>   		goto e_free;
>   
>   	sev->active = true;
> -	sev->asid = asid;
>   	INIT_LIST_HEAD(&sev->regions_list);
>   
>   	return 0;
>   
>   e_free:
> -	sev_asid_free(asid);
> +	sev_asid_free(kvm);
>   	return ret;
>   }
>   
> @@ -1240,12 +1258,12 @@ void sev_vm_destroy(struct kvm *kvm)
>   	mutex_unlock(&kvm->lock);
>   
>   	sev_unbind_asid(kvm, sev->handle);
> -	sev_asid_free(sev->asid);
> +	sev_asid_free(kvm);
>   }
>   
>   void __init sev_hardware_setup(void)
>   {
> -	unsigned int eax, ebx, ecx, edx;
> +	unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
>   	bool sev_es_supported = false;
>   	bool sev_supported = false;
>   
> @@ -1277,7 +1295,11 @@ void __init sev_hardware_setup(void)
>   	if (!sev_reclaim_asid_bitmap)
>   		goto out;
>   
> -	pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
> +	sev_asid_count = max_sev_asid - min_sev_asid + 1;
> +	if (enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, sev_asid_count))
> +		goto out;
> +
> +	pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
>   	sev_supported = true;
>   
>   	/* SEV-ES support requested? */
> @@ -1292,7 +1314,11 @@ void __init sev_hardware_setup(void)
>   	if (min_sev_asid == 1)
>   		goto out;
>   
> -	pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
> +	sev_es_asid_count = min_sev_asid - 1;
> +	if (enc_id_cg_set_capacity(ENCRYPTION_ID_SEV_ES, sev_es_asid_count))
> +		goto out;
> +
> +	pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
>   	sev_es_supported = true;
>   
>   out:
> @@ -1307,6 +1333,8 @@ void sev_hardware_teardown(void)
>   
>   	bitmap_free(sev_asid_bitmap);
>   	bitmap_free(sev_reclaim_asid_bitmap);
> +	enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, 0);
> +	enc_id_cg_set_capacity(ENCRYPTION_ID_SEV_ES, 0);
>   
>   	sev_flush_asids();
>   }
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index acb77dcff3b4..83754f58c05e 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -61,6 +61,10 @@ SUBSYS(pids)
>   SUBSYS(rdma)
>   #endif
>   
> +#if IS_ENABLED(CONFIG_CGROUP_ENCRYPTION_IDS)
> +SUBSYS(encryption_ids)
> +#endif
> +
>   /*
>    * The following subsystems are not supported on the default hierarchy.
>    */
> diff --git a/include/linux/encryption_ids_cgroup.h b/include/linux/encryption_ids_cgroup.h
> new file mode 100644
> index 000000000000..af428a4beb28
> --- /dev/null
> +++ b/include/linux/encryption_ids_cgroup.h
> @@ -0,0 +1,72 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Encryption IDs cgroup controller.
> + *
> + * Copyright 2020 Google LLC
> + * Author: Vipin Sharma <vipinsh@google.com>
> + */
> +#ifndef _ENCRYPTION_IDS_CGROUP_H_
> +#define _ENCRYPTION_IDS_CGROUP_H_
> +
> +#include <linux/cgroup-defs.h>
> +#include <linux/kvm_types.h>
> +
> +/**
> + * Types of encryption IDs supported by the host.
> + */
> +enum encryption_id_type {
> +#ifdef CONFIG_KVM_AMD_SEV
> +	ENCRYPTION_ID_SEV,
> +	ENCRYPTION_ID_SEV_ES,
> +#endif
> +	ENCRYPTION_ID_TYPES
> +};
> +
> +#ifdef CONFIG_CGROUP_ENCRYPTION_IDS
> +
> +/**
> + * struct encryption_id_res: Per cgroup per encryption ID resource
> + * @max: Maximum count of encryption ID that can be used.
> + * @usage: Current usage of encryption ID in the cgroup.
> + */
> +struct encryption_id_res {
> +	unsigned int max;
> +	unsigned int usage;
> +};
> +
> +/**
> + * struct encryption_id_cgroup - Encryption IDs controller's cgroup structure.
> + * @css: cgroup subsys state object.
> + * @ids: Array of encryption IDs resource usage in the cgroup.
> + */
> +struct encryption_id_cgroup {
> +	struct cgroup_subsys_state css;
> +	struct encryption_id_res res[ENCRYPTION_ID_TYPES];
> +};
> +
> +int enc_id_cg_set_capacity(enum encryption_id_type type, unsigned int capacity);
> +int enc_id_cg_try_charge(struct kvm *kvm, enum encryption_id_type type,
> +			 unsigned int amount);
> +void enc_id_cg_uncharge(struct kvm *kvm, enum encryption_id_type type,
> +			unsigned int amount);
> +#else
> +static inline int enc_id_cg_set_capacity(enum encryption_id_type type,
> +					 unsigned int capacity)
> +{
> +	return 0;
> +}
> +
> +static inline int enc_id_cg_try_charge(struct kvm *kvm,
> +				       enum encryption_id_type type,
> +				       unsigned int amount)
> +{
> +	return 0;
> +}
> +
> +static inline void enc_id_cg_uncharge(struct kvm *kvm,
> +				      enum encryption_id_type type,
> +				      unsigned int amount)
> +{
> +}
> +#endif /* CONFIG_CGROUP_ENCRYPTION_IDS */
> +#endif /* _ENCRYPTION_CGROUP_H_ */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f3b1013fb22c..ae9fde0d4267 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -27,6 +27,7 @@
>   #include <linux/refcount.h>
>   #include <linux/nospec.h>
>   #include <asm/signal.h>
> +#include <linux/encryption_ids_cgroup.h>
>   
>   #include <linux/kvm.h>
>   #include <linux/kvm_para.h>
> @@ -513,6 +514,9 @@ struct kvm {
>   	pid_t userspace_pid;
>   	unsigned int max_halt_poll_ns;
>   	u32 dirty_ring_size;
> +#ifdef CONFIG_CGROUP_ENCRYPTION_IDS
> +	struct encryption_id_cgroup *enc_id_cg;
> +#endif
>   };
>   
>   #define kvm_err(fmt, ...) \
> diff --git a/init/Kconfig b/init/Kconfig
> index b77c60f8b963..6c0bd0e7c08d 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1106,6 +1106,20 @@ config CGROUP_BPF
>   	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
>   	  inet sockets.
>   
> +config CGROUP_ENCRYPTION_IDS
> +	bool "Encryption IDs controller"
> +	depends on KVM_AMD_SEV
> +	default n
> +	help
> +	  Provides a controller for CPU encryption IDs on a host.
> +
> +	  Some platforms have limited number of encryption IDs which can be
> +	  used simultaneously, e.g., AMD's Secure Encrypted Virtualization
> +	  (SEV). This controller tracks and limits the total number of IDs used
> +	  by processes attached to a cgroup hierarchy. For more information,
> +	  please check Encryption IDs section in
> +	  /Documentation/admin-guide/cgroup-v2.rst.
> +
>   config CGROUP_DEBUG
>   	bool "Debug controller"
>   	default n
> diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
> index 5d7a76bfbbb7..6c19208dfb7f 100644
> --- a/kernel/cgroup/Makefile
> +++ b/kernel/cgroup/Makefile
> @@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
>   obj-$(CONFIG_CGROUP_PIDS) += pids.o
>   obj-$(CONFIG_CGROUP_RDMA) += rdma.o
>   obj-$(CONFIG_CPUSETS) += cpuset.o
> +obj-$(CONFIG_CGROUP_ENCRYPTION_IDS) += encryption_ids.o
>   obj-$(CONFIG_CGROUP_DEBUG) += debug.o
> diff --git a/kernel/cgroup/encryption_ids.c b/kernel/cgroup/encryption_ids.c
> new file mode 100644
> index 000000000000..7cd7d3951bb9
> --- /dev/null
> +++ b/kernel/cgroup/encryption_ids.c
> @@ -0,0 +1,422 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Encryption IDs cgroup controller
> + *
> + * Copyright 2020 Google LLC
> + * Author: Vipin Sharma <vipinsh@google.com>
> + */
> +
> +#include <linux/limits.h>
> +#include <linux/cgroup.h>
> +#include <linux/errno.h>
> +#include <linux/spinlock.h>
> +#include <linux/lockdep.h>
> +#include <linux/slab.h>
> +#include <linux/kvm_host.h>
> +#include <linux/encryption_ids_cgroup.h>
> +
> +#define MAX_STR "max"
> +#define MAX_NUM UINT_MAX
> +
> +/* Root Encryption ID cgroup */
> +static struct encryption_id_cgroup root_cg;
> +
> +/* Lock for tracking and updating encryption ID resources. */
> +static DEFINE_SPINLOCK(enc_id_cg_lock);
> +
> +/* Encryption ID types capacity. */
> +static unsigned int enc_id_capacity[ENCRYPTION_ID_TYPES];
> +
> +/**
> + * css_enc() - Get encryption ID cgroup from the css.
> + * @css: cgroup subsys state object.
> + *
> + * Context: Any context.
> + * Return:
> + * * %NULL - If @css is null.
> + * * struct encryption_id_cgroup* - Encryption ID cgroup pointer of the passed
> + *				    css.
> + */
> +static struct encryption_id_cgroup *css_enc(struct cgroup_subsys_state *css)
> +{
> +	return css ? container_of(css, struct encryption_id_cgroup, css) : NULL;
> +}
> +
> +/**
> + * parent_enc() - Get the parent of the passed encryption ID cgroup.
> + * @cgroup: cgroup whose parent needs to be fetched.
> + *
> + * Context: Any context.
> + * Return:
> + * * struct encryption_id_cgroup* - Parent of the @cgroup.
> + * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
> + */
> +static struct encryption_id_cgroup *
> +parent_enc(struct encryption_id_cgroup *cgroup)
> +{
> +	return cgroup ? css_enc(cgroup->css.parent) : NULL;
> +}
> +
> +/**
> + * valid_type() - Check if @type is valid or not.
> + * @type: encryption ID type.
> + *
> + * Context: Any context.
> + * Return:
> + * * true - If valid type.
> + * * false - If not valid type.
> + */
> +static inline bool valid_type(enum encryption_id_type type)
> +{
> +	return type >= 0 && type < ENCRYPTION_ID_TYPES;
> +}
> +
> +/**
> + * enc_id_cg_uncharge_hierarchy() - Uncharge the enryption ID cgroup hierarchy.
> + * @start_cg: Starting cgroup.
> + * @stop_cg: cgroup at which uncharge stops.
> + * @type: type of encryption ID to uncharge.
> + * @amount: Charge amount.
> + *
> + * Uncharge the cgroup tree from the given start cgroup to the stop cgroup.
> + *
> + * Context: Any context. Expects enc_id_cg_lock to be held by the caller.
> + */
> +static void enc_id_cg_uncharge_hierarchy(struct encryption_id_cgroup *start_cg,
> +					 struct encryption_id_cgroup *stop_cg,
> +					 enum encryption_id_type type,
> +					 unsigned int amount)
> +{
> +	struct encryption_id_cgroup *i;
> +
> +	lockdep_assert_held(&enc_id_cg_lock);
> +
> +	for (i = start_cg; i != stop_cg; i = parent_enc(i)) {
> +		WARN_ON_ONCE(i->res[type].usage < amount);
> +		i->res[type].usage -= amount;
> +	}
> +	css_put(&start_cg->css);
> +}
> +
> +/**
> + * enc_id_cg_set_capacity() - Set the capacity of the encryption ID.
> + * @type: Type of the encryption ID.
> + * @capacity: Supported capacity of the encryption ID on the host.
> + *
> + * If capacity is 0 then the charging a cgroup fails for the encryption ID.
> + *
> + * Context: Any context. Takes and releases enc_id_cg_lock.
> + * Return:
> + * * %0 - Successfully registered the capacity.
> + * * %-EINVAL - If @type is invalid.
> + * * %-EBUSY - If current usage is more than the capacity.
> + */
> +int enc_id_cg_set_capacity(enum encryption_id_type type, unsigned int capacity)
> +{
> +	int ret = 0;
> +	unsigned long flags;
> +
> +	if (!valid_type(type))
> +		return -EINVAL;
> +
> +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> +
> +	if (WARN_ON_ONCE(root_cg.res[type].usage > capacity))
> +		ret = -EBUSY;
> +	else
> +		enc_id_capacity[type] = capacity;
> +
> +	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(enc_id_cg_set_capacity);
> +
> +/**
> + * enc_id_cg_try_charge() - Try charging encryption ID cgroup.
> + * @kvm: kvm to store charged cgroup.
> + * @type: Encryption ID type to charge.
> + * @amount: Amount to charge.
> + *
> + * Charge @amount to the cgroup to which the current task belongs to. Charged
> + * cgroup will be pointed by @cg. Caller must use the same cgroup during
> + * uncharge call.
> + *
> + * Context: Any context. Takes and releases enc_id_cg_lock.
> + * Return:
> + * * %0 - If successfully charged.
> + * * -EINVAL - If @type is invalid or encryption ID has 0 capacity.
> + * * -EBUSY - If max limit will be crossed or total usage will be more than the
> + *	      capacity.
> + */
> +int enc_id_cg_try_charge(struct kvm *kvm, enum encryption_id_type type,
> +			 unsigned int amount)
> +{
> +	struct encryption_id_cgroup *task_cg, *i;
> +	struct encryption_id_res *id_res;
> +	int ret;
> +	unsigned int new_usage;
> +	unsigned long flags;
> +
> +	if (!valid_type(type) || !kvm)
> +		return -EINVAL;
> +
> +	if (!amount)
> +		return 0;
> +
> +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> +
> +	if (!enc_id_capacity[type]) {
> +		ret = -EINVAL;
> +		goto err_capacity;
> +	}
> +
> +	task_cg = css_enc(task_get_css(current, encryption_ids_cgrp_id));
> +
> +	for (i = task_cg; i; i = parent_enc(i)) {
> +		id_res = &i->res[type];
> +
> +		new_usage = id_res->usage + amount;
> +		WARN_ON_ONCE(new_usage < id_res->usage);
> +
> +		if (new_usage > id_res->max ||
> +		    new_usage > enc_id_capacity[type]) {
> +			ret = -EBUSY;
> +			goto err_charge;
> +		}
> +
> +		id_res->usage = new_usage;
> +	}
> +
> +	kvm->enc_id_cg = task_cg;
> +	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
> +	return 0;
> +
> +err_charge:
> +	enc_id_cg_uncharge_hierarchy(task_cg, i, type, amount);
> +err_capacity:
> +	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
> +	return ret;
> +}
> +EXPORT_SYMBOL(enc_id_cg_try_charge);
> +
> +/**
> + * enc_id_cg_uncharge() - Uncharge the encryption ID cgroup.
> + * @kvm: kvm containing the corresponding encryption ID cgroup.
> + * @type: Encryption ID which was charged.
> + * @amount: Charged amount.
> + *
> + * Context: Any context. Takes and releases enc_id_cg_lock.
> + */
> +void enc_id_cg_uncharge(struct kvm *kvm, enum encryption_id_type type,
> +			unsigned int amount)
> +{
> +	unsigned long flags;
> +
> +	if (!amount)
> +		return;
> +	if (!valid_type(type))
> +		return;
> +	if (!kvm || WARN_ON_ONCE(!(kvm->enc_id_cg)))
> +		return;
> +
> +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> +	enc_id_cg_uncharge_hierarchy(kvm->enc_id_cg, NULL, type, amount);
> +	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
> +
> +	kvm->enc_id_cg = NULL;
> +}
> +EXPORT_SYMBOL(enc_id_cg_uncharge);
> +
> +/**
> + * enc_id_cg_max_show() - Show encryption ID cgroup max limit.
> + * @sf: Interface file
> + * @v: Arguments passed
> + *
> + * Uses cft->private value to determine for which enryption ID type results be
> + * shown.
> + *
> + * Context: Any context.
> + * Return: 0 to denote successful print.
> + */
> +static int enc_id_cg_max_show(struct seq_file *sf, void *v)
> +{
> +	struct encryption_id_cgroup *cg = css_enc(seq_css(sf));
> +	enum encryption_id_type type = seq_cft(sf)->private;
> +
> +	if (cg->res[type].max == MAX_NUM)
> +		seq_printf(sf, "%s\n", MAX_STR);
> +	else
> +		seq_printf(sf, "%u\n", cg->res[type].max);
> +
> +	return 0;
> +}
> +
> +/**
> + * enc_id_cg_max_write() - Update the maximum limit of the cgroup.
> + * @of: Handler for the file.
> + * @buf: Data from the user. It should be either "max", 0, or a positive
> + *	 integer.
> + * @nbytes: Number of bytes of the data.
> + * @off: Offset in the file.
> + *
> + * Uses cft->private value to determine for which enryption ID type results be
> + * shown.
> + *
> + * Context: Any context. Takes and releases enc_id_cg_lock.
> + * Return:
> + * * >= 0 - Number of bytes processed in the input.
> + * * -EINVAL - If buf is not valid.
> + * * -ERANGE - If number is bigger than unsigned int capacity.
> + * * -EBUSY - If usage can become more than max limit.
> + */
> +static ssize_t enc_id_cg_max_write(struct kernfs_open_file *of, char *buf,
> +				   size_t nbytes, loff_t off)
> +{
> +	struct encryption_id_cgroup *cg;
> +	unsigned int max;
> +	int ret = 0;
> +	enum encryption_id_type type;
> +
> +	buf = strstrip(buf);
> +	if (!strcmp(MAX_STR, buf)) {
> +		max = UINT_MAX;
> +	} else {
> +		ret = kstrtouint(buf, 0, &max);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	cg = css_enc(of_css(of));
> +	type = of_cft(of)->private;
> +	cg->res[type].max = max;
> +
> +	return nbytes;
> +}
> +
> +/**
> + * enc_id_cg_current_read() - Show current usage of the encryption ID.
> + * @css: css pointer of the cgroup.
> + * @cft: cft pointer of the cgroup.
> + *
> + * Uses cft->private value to determine for which enryption ID type results be
> + * shown.
> + *
> + * Context: Any context.
> + * Return: 0 to denote successful print.
> + */
> +static u64 enc_id_cg_current_read(struct cgroup_subsys_state *css,
> +				  struct cftype *cft)
> +{
> +	struct encryption_id_cgroup *cg = css_enc(css);
> +	enum encryption_id_type type = cft->private;
> +
> +	return cg->res[type].usage;
> +}
> +
> +/**
> + * enc_id_cg_stat_show() - Show the current stat of the cgroup.
> + * @sf: Interface file
> + * @v: Arguments passed
> + *
> + * Shows the total capacity of the encryption ID and its current usage.
> + * Only shows in root cgroup directory.
> + *
> + * Uses cft->private value to determine for which enryption ID type results be
> + * shown.
> + *
> + * Context: Any context. Takes and releases enc_id_cg_lock.
> + * Return: 0 to denote successful print.
> + */
> +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> +{
> +	unsigned long flags;
> +	enum encryption_id_type type = seq_cft(sf)->private;
> +
> +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> +
> +	seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> +	seq_printf(sf, "used %u\n", root_cg.res[type].usage);
> +
> +	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
> +	return 0;
> +}
> +
> +/* Each encryption ID type has these cgroup files. */
> +#define ENC_ID_CGROUP_FILES(id_name, id_type)		\
> +	[(id_type) * 3] = {				\
> +		.name = id_name ".max",			\
> +		.write = enc_id_cg_max_write,		\
> +		.seq_show = enc_id_cg_max_show,		\
> +		.flags = CFTYPE_NOT_ON_ROOT,		\
> +		.private = id_type,			\
> +	},						\
> +	[((id_type) * 3) + 1] = {			\
> +		.name = id_name ".current",		\
> +		.read_u64 = enc_id_cg_current_read,	\
> +		.flags = CFTYPE_NOT_ON_ROOT,		\
> +		.private = id_type,			\
> +	},						\
> +	[((id_type) * 3) + 2] = {			\
> +		.name = id_name ".stat",		\
> +		.seq_show = enc_id_cg_stat_show,	\
> +		.flags = CFTYPE_ONLY_ON_ROOT,		\
> +		.private = id_type,			\
> +	}
> +
> +/* Encryption ID cgroup interface files */
> +static struct cftype enc_id_cg_files[] = {
> +#ifdef CONFIG_KVM_AMD_SEV
> +	ENC_ID_CGROUP_FILES("sev", ENCRYPTION_ID_SEV),
> +	ENC_ID_CGROUP_FILES("sev_es", ENCRYPTION_ID_SEV_ES),
> +#endif
> +	{}
> +};
> +
> +/**
> + * enc_id_cg_alloc() - Allocate encryption ID cgroup.
> + * @parent_css: Parent cgroup.
> + *
> + * Context: Process context.
> + * Return:
> + * * struct cgroup_subsys_state* - css of the allocated cgroup.
> + * * ERR_PTR(-ENOMEM) - No memory available to allocate.
> + */
> +static struct cgroup_subsys_state *
> +enc_id_cg_alloc(struct cgroup_subsys_state *parent_css)
> +{
> +	enum encryption_id_type i;
> +	struct encryption_id_cgroup *cg;
> +
> +	if (!parent_css) {
> +		cg = &root_cg;
> +	} else {
> +		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
> +		if (!cg)
> +			return ERR_PTR(-ENOMEM);
> +	}
> +
> +	for (i = 0; i < ENCRYPTION_ID_TYPES; i++)
> +		cg->res[i].max = MAX_NUM;
> +
> +	return &cg->css;
> +}
> +
> +/**
> + * enc_id_cg_free() - Free the encryption ID cgroup.
> + * @css: cgroup subsys object.
> + *
> + * Context: Any context.
> + */
> +static void enc_id_cg_free(struct cgroup_subsys_state *css)
> +{
> +	kfree(css_enc(css));
> +}
> +
> +/* Cgroup controller callbacks */
> +struct cgroup_subsys encryption_ids_cgrp_subsys = {
> +	.css_alloc = enc_id_cg_alloc,
> +	.css_free = enc_id_cg_free,
> +	.legacy_cftypes = enc_id_cg_files,
> +	.dfl_cftypes = enc_id_cg_files,
> +};
Tejun Heo Jan. 15, 2021, 8:59 p.m. UTC | #2
Hello,

On Thu, Jan 07, 2021 at 05:28:45PM -0800, Vipin Sharma wrote:
> 1. encrpytion_ids.sev.max
> 	Sets the maximum usage of SEV IDs in the cgroup.
> 2. encryption_ids.sev.current
> 	Current usage of SEV IDs in the cgroup and its children.
> 3. encryption_ids.sev.stat
> 	Shown only at the root cgroup. Displays total SEV IDs available
> 	on the platform and current usage count.

Sorry, should have raised these earlier:

* Can we shorten the name to encids?

* Why is .sev a separate namespace? Isn't the controller supposed to cover
  encryption ids across different implementations? It's not like multiple
  types of IDs can be in use on the same machine, right?

> Other ID types can be easily added in the controller in the same way.

I'm not sure this is necessarily a good thing.

> +/**
> + * enc_id_cg_uncharge_hierarchy() - Uncharge the enryption ID cgroup hierarchy.
> + * @start_cg: Starting cgroup.
> + * @stop_cg: cgroup at which uncharge stops.
> + * @type: type of encryption ID to uncharge.
> + * @amount: Charge amount.
> + *
> + * Uncharge the cgroup tree from the given start cgroup to the stop cgroup.
> + *
> + * Context: Any context. Expects enc_id_cg_lock to be held by the caller.
> + */
> +static void enc_id_cg_uncharge_hierarchy(struct encryption_id_cgroup *start_cg,
> +					 struct encryption_id_cgroup *stop_cg,
> +					 enum encryption_id_type type,
> +					 unsigned int amount)
> +{
> +	struct encryption_id_cgroup *i;
> +
> +	lockdep_assert_held(&enc_id_cg_lock);
> +
> +	for (i = start_cg; i != stop_cg; i = parent_enc(i)) {
> +		WARN_ON_ONCE(i->res[type].usage < amount);
> +		i->res[type].usage -= amount;
> +	}
> +	css_put(&start_cg->css);

I'm curious whether this is necessary given that a css can't be destroyed
while tasks are attached. Are there cases where this wouldn't hold true? If
so, it'd be great to have some comments on how that can happen.

> +/**
> + * enc_id_cg_max_write() - Update the maximum limit of the cgroup.
> + * @of: Handler for the file.
> + * @buf: Data from the user. It should be either "max", 0, or a positive
> + *	 integer.
> + * @nbytes: Number of bytes of the data.
> + * @off: Offset in the file.
> + *
> + * Uses cft->private value to determine for which enryption ID type results be
> + * shown.
> + *
> + * Context: Any context. Takes and releases enc_id_cg_lock.
> + * Return:
> + * * >= 0 - Number of bytes processed in the input.
> + * * -EINVAL - If buf is not valid.
> + * * -ERANGE - If number is bigger than unsigned int capacity.
> + * * -EBUSY - If usage can become more than max limit.

The aboves are stale, right?

> +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> +{
> +	unsigned long flags;
> +	enum encryption_id_type type = seq_cft(sf)->private;
> +
> +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> +
> +	seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> +	seq_printf(sf, "used %u\n", root_cg.res[type].usage);

Dup with .current and no need to show total on every cgroup, right?

Thanks.
Vipin Sharma Jan. 15, 2021, 10:18 p.m. UTC | #3
On Fri, Jan 15, 2021 at 03:59:25PM -0500, Tejun Heo wrote:
> Hello,
> 
> On Thu, Jan 07, 2021 at 05:28:45PM -0800, Vipin Sharma wrote:
> > 1. encrpytion_ids.sev.max
> > 	Sets the maximum usage of SEV IDs in the cgroup.
> > 2. encryption_ids.sev.current
> > 	Current usage of SEV IDs in the cgroup and its children.
> > 3. encryption_ids.sev.stat
> > 	Shown only at the root cgroup. Displays total SEV IDs available
> > 	on the platform and current usage count.
> 
> Sorry, should have raised these earlier:
> 
> * Can we shorten the name to encids?

Sure.

> 
> * Why is .sev a separate namespace? Isn't the controller supposed to cover
>   encryption ids across different implementations? It's not like multiple
>   types of IDs can be in use on the same machine, right?
> 

On AMD platform we have two types SEV and SEV-ES which can exists
simultaneously and they have their own quota.

> > Other ID types can be easily added in the controller in the same way.
> 
> I'm not sure this is necessarily a good thing.

This is to just say that when Intel and PowerPC changes are ready it
won't be difficult for them to add their controller.

> 
> > +/**
> > + * enc_id_cg_uncharge_hierarchy() - Uncharge the enryption ID cgroup hierarchy.
> > + * @start_cg: Starting cgroup.
> > + * @stop_cg: cgroup at which uncharge stops.
> > + * @type: type of encryption ID to uncharge.
> > + * @amount: Charge amount.
> > + *
> > + * Uncharge the cgroup tree from the given start cgroup to the stop cgroup.
> > + *
> > + * Context: Any context. Expects enc_id_cg_lock to be held by the caller.
> > + */
> > +static void enc_id_cg_uncharge_hierarchy(struct encryption_id_cgroup *start_cg,
> > +					 struct encryption_id_cgroup *stop_cg,
> > +					 enum encryption_id_type type,
> > +					 unsigned int amount)
> > +{
> > +	struct encryption_id_cgroup *i;
> > +
> > +	lockdep_assert_held(&enc_id_cg_lock);
> > +
> > +	for (i = start_cg; i != stop_cg; i = parent_enc(i)) {
> > +		WARN_ON_ONCE(i->res[type].usage < amount);
> > +		i->res[type].usage -= amount;
> > +	}
> > +	css_put(&start_cg->css);
> 
> I'm curious whether this is necessary given that a css can't be destroyed
> while tasks are attached. Are there cases where this wouldn't hold true? If
> so, it'd be great to have some comments on how that can happen.

We are not moving charges when a task moves out. This can lead us to the
cases where all of the tasks in the cgroup have moved out but it
still has charges. In that scenarios cgroup can be deleted. Taking a
reference will make sure cgroup is atleast present internally.

Also, struct encryption_ic_cgroup has a reference to the cgroup which is
used during uncharge call to correctly identify from which cgroup charge
should be deducted.

> 
> > +/**
> > + * enc_id_cg_max_write() - Update the maximum limit of the cgroup.
> > + * @of: Handler for the file.
> > + * @buf: Data from the user. It should be either "max", 0, or a positive
> > + *	 integer.
> > + * @nbytes: Number of bytes of the data.
> > + * @off: Offset in the file.
> > + *
> > + * Uses cft->private value to determine for which enryption ID type results be
> > + * shown.
> > + *
> > + * Context: Any context. Takes and releases enc_id_cg_lock.
> > + * Return:
> > + * * >= 0 - Number of bytes processed in the input.
> > + * * -EINVAL - If buf is not valid.
> > + * * -ERANGE - If number is bigger than unsigned int capacity.
> > + * * -EBUSY - If usage can become more than max limit.
> 
> The aboves are stale, right?

-EBUSY is not valid anymore. We can now set max to be less than the usage. I
will remove it in the next patch.

> 
> > +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> > +{
> > +	unsigned long flags;
> > +	enum encryption_id_type type = seq_cft(sf)->private;
> > +
> > +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> > +
> > +	seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> > +	seq_printf(sf, "used %u\n", root_cg.res[type].usage);
> 
> Dup with .current and no need to show total on every cgroup, right?

This is for the stat file which will only be seen in the root cgroup
directory.  It is to know overall picture for the resource, what is the
total capacity and what is the current usage. ".current" file is not
shown on the root cgroup.

This information is good for resource allocation in the cloud
infrastructure.

> 
> Thanks.
> 
> -- 
> tejun
Tejun Heo Jan. 16, 2021, 3:43 a.m. UTC | #4
On Fri, Jan 15, 2021 at 02:18:40PM -0800, Vipin Sharma wrote:
> > * Why is .sev a separate namespace? Isn't the controller supposed to cover
> >   encryption ids across different implementations? It's not like multiple
> >   types of IDs can be in use on the same machine, right?
> > 
> 
> On AMD platform we have two types SEV and SEV-ES which can exists
> simultaneously and they have their own quota.

Can you please give a brief explanation of the two and lay out a scenario
where the two are being used / allocated disjointly?

> > > Other ID types can be easily added in the controller in the same way.
> > 
> > I'm not sure this is necessarily a good thing.
> 
> This is to just say that when Intel and PowerPC changes are ready it
> won't be difficult for them to add their controller.

I'm not really enthused about having per-hardware-type control knobs. None
of other controllers behave that way. Unless it can be abstracted into
something common, I'm likely to object.

> > > +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> > > +{
> > > +	unsigned long flags;
> > > +	enum encryption_id_type type = seq_cft(sf)->private;
> > > +
> > > +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> > > +
> > > +	seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> > > +	seq_printf(sf, "used %u\n", root_cg.res[type].usage);
> > 
> > Dup with .current and no need to show total on every cgroup, right?
> 
> This is for the stat file which will only be seen in the root cgroup
> directory.  It is to know overall picture for the resource, what is the
> total capacity and what is the current usage. ".current" file is not
> shown on the root cgroup.

Ah, missed the flags. It's odd for the usage to be presented in two
different ways tho. I think it'd make more sense w/ cgroup.current at root
level. Is the total number available somewhere else in the system?

Thanks.
Vipin Sharma Jan. 16, 2021, 4:32 a.m. UTC | #5
On Fri, Jan 15, 2021 at 10:43:32PM -0500, Tejun Heo wrote:
> On Fri, Jan 15, 2021 at 02:18:40PM -0800, Vipin Sharma wrote:
> > > * Why is .sev a separate namespace? Isn't the controller supposed to cover
> > >   encryption ids across different implementations? It's not like multiple
> > >   types of IDs can be in use on the same machine, right?
> > > 
> > 
> > On AMD platform we have two types SEV and SEV-ES which can exists
> > simultaneously and they have their own quota.
> 
> Can you please give a brief explanation of the two and lay out a scenario
> where the two are being used / allocated disjointly?
> 

SEV-ES has stronger memory encryption gurantees compared to SEV, apart
from encrypting the application memory it also encrypts register state
among other things. In a single host ASIDs can be distributed between
these two types by BIOS settings.

Currently, Google Cloud has Confidential VM machines offering using SEV.
ASIDs are not compatible between SEV and SEV-ES, so a VM running on SEV
cannot run on SEV-ES and vice versa

There are use cases for both types of VMs getting used in future.

> > > > Other ID types can be easily added in the controller in the same way.
> > > 
> > > I'm not sure this is necessarily a good thing.
> > 
> > This is to just say that when Intel and PowerPC changes are ready it
> > won't be difficult for them to add their controller.
> 
> I'm not really enthused about having per-hardware-type control knobs. None
> of other controllers behave that way. Unless it can be abstracted into
> something common, I'm likely to object.

There was a discussion in Patch v1 and consensus was to have individual
files because it makes kernel implementation extremely simple.

https://lore.kernel.org/lkml/alpine.DEB.2.23.453.2011131615510.333518@chino.kir.corp.google.com/#t

> 
> > > > +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> > > > +{
> > > > +	unsigned long flags;
> > > > +	enum encryption_id_type type = seq_cft(sf)->private;
> > > > +
> > > > +	spin_lock_irqsave(&enc_id_cg_lock, flags);
> > > > +
> > > > +	seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> > > > +	seq_printf(sf, "used %u\n", root_cg.res[type].usage);
> > > 
> > > Dup with .current and no need to show total on every cgroup, right?
> > 
> > This is for the stat file which will only be seen in the root cgroup
> > directory.  It is to know overall picture for the resource, what is the
> > total capacity and what is the current usage. ".current" file is not
> > shown on the root cgroup.
> 
> Ah, missed the flags. It's odd for the usage to be presented in two
> different ways tho. I think it'd make more sense w/ cgroup.current at root
> level. Is the total number available somewhere else in the system?

This information is not available anywhere else in the system. Only
other way to get this value is to use CPUID instruction (0x8000001F) of
the processor. Which also has disdvantage if sev module in kernel
doesn't use all of the available ASIDs for its work (right now it uses
all) then there will be a mismatch between what user get through their
code and what is actually getting used in the kernel by sev.

In cgroup v2, I didn't see current files for other cgroups in root
folder that is why I didn't show that file in root folder.

Will you be fine if I show two files in the root, something like:

encids.sev.capacity
encids.sev.current

In non root folder, it will be:
encids.sev.max
encids.sev.current

I still prefer encids.sev.stat, as it won't repeat same information in
each cgroup but let me know what you think.

Thanks
Tejun Heo Jan. 19, 2021, 3:51 p.m. UTC | #6
Hello,

On Fri, Jan 15, 2021 at 08:32:19PM -0800, Vipin Sharma wrote:
> SEV-ES has stronger memory encryption gurantees compared to SEV, apart
> from encrypting the application memory it also encrypts register state
> among other things. In a single host ASIDs can be distributed between
> these two types by BIOS settings.
> 
> Currently, Google Cloud has Confidential VM machines offering using SEV.
> ASIDs are not compatible between SEV and SEV-ES, so a VM running on SEV
> cannot run on SEV-ES and vice versa
> 
> There are use cases for both types of VMs getting used in future.

Can you please elaborate? I skimmed through the amd manual and it seemed to
say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the use
case for mixing those two?

> > > > > Other ID types can be easily added in the controller in the same way.
> > > > 
> > > > I'm not sure this is necessarily a good thing.
> > > 
> > > This is to just say that when Intel and PowerPC changes are ready it
> > > won't be difficult for them to add their controller.
> > 
> > I'm not really enthused about having per-hardware-type control knobs. None
> > of other controllers behave that way. Unless it can be abstracted into
> > something common, I'm likely to object.
> 
> There was a discussion in Patch v1 and consensus was to have individual
> files because it makes kernel implementation extremely simple.
> 
> https://lore.kernel.org/lkml/alpine.DEB.2.23.453.2011131615510.333518@chino.kir.corp.google.com/#t

I'm very reluctant to ack vendor specific interfaces for a few reasons but
most importantly because they usually indicate abstraction and/or the
underlying feature not being sufficiently developed and they tend to become
baggages after a while. So, here are my suggestions:

* If there can be a shared abstraction which hopefully makes intuitive
  sense, that'd be ideal. It doesn't have to be one knob but it shouldn't be
  something arbitrary to specific vendors.

* If we aren't there yet and vendor-specific interface is a must, attach
  that part to an interface which is already vendor-aware.

> This information is not available anywhere else in the system. Only
> other way to get this value is to use CPUID instruction (0x8000001F) of
> the processor. Which also has disdvantage if sev module in kernel
> doesn't use all of the available ASIDs for its work (right now it uses
> all) then there will be a mismatch between what user get through their
> code and what is actually getting used in the kernel by sev.
> 
> In cgroup v2, I didn't see current files for other cgroups in root
> folder that is why I didn't show that file in root folder.
> 
> Will you be fine if I show two files in the root, something like:
> 
> encids.sev.capacity
> encids.sev.current
> 
> In non root folder, it will be:
> encids.sev.max
> encids.sev.current
> 
> I still prefer encids.sev.stat, as it won't repeat same information in
> each cgroup but let me know what you think.

Yeah, this will be a first and I was mostly wondering about the same number
appearing under different files / names on root and !root cgroups. I'm
leaning more towards capacity/current but let me think about it a bit more.

Thank you.
Vipin Sharma Jan. 20, 2021, 7:13 a.m. UTC | #7
On Tue, Jan 19, 2021 at 10:51:24AM -0500, Tejun Heo wrote:
> Hello,
> 
> On Fri, Jan 15, 2021 at 08:32:19PM -0800, Vipin Sharma wrote:
> > SEV-ES has stronger memory encryption gurantees compared to SEV, apart
> > from encrypting the application memory it also encrypts register state
> > among other things. In a single host ASIDs can be distributed between
> > these two types by BIOS settings.
> > 
> > Currently, Google Cloud has Confidential VM machines offering using SEV.
> > ASIDs are not compatible between SEV and SEV-ES, so a VM running on SEV
> > cannot run on SEV-ES and vice versa
> > 
> > There are use cases for both types of VMs getting used in future.
> 
> Can you please elaborate? I skimmed through the amd manual and it seemed to
> say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the use
> case for mixing those two?

For example, customers can be given options for which kind of protection they
want to choose for their workloads based on factors like data protection
requirement, cost, speed, etc.

In terms of features SEV-ES is superset of SEV but that doesn't mean SEV
ASIDs are superset of SEV ASIDs. SEV ASIDs cannot be used for SEV-ES VMs
and similarly SEV-ES ASIDs cannot be used for SEV VMs. Once a system is
booted, based on the BIOS settings each type will have their own
capacity and that number cannot be changed until the next boot and BIOS
changes.

We are not mixing the two types of ASIDs, they are separate and used
separately.

> 
> > > > > > Other ID types can be easily added in the controller in the same way.
> > > > > 
> > > > > I'm not sure this is necessarily a good thing.
> > > > 
> > > > This is to just say that when Intel and PowerPC changes are ready it
> > > > won't be difficult for them to add their controller.
> > > 
> > > I'm not really enthused about having per-hardware-type control knobs. None
> > > of other controllers behave that way. Unless it can be abstracted into
> > > something common, I'm likely to object.
> > 
> > There was a discussion in Patch v1 and consensus was to have individual
> > files because it makes kernel implementation extremely simple.
> > 
> > https://lore.kernel.org/lkml/alpine.DEB.2.23.453.2011131615510.333518@chino.kir.corp.google.com/#t
> 
> I'm very reluctant to ack vendor specific interfaces for a few reasons but
> most importantly because they usually indicate abstraction and/or the
> underlying feature not being sufficiently developed and they tend to become
> baggages after a while. So, here are my suggestions:

My first patch was only for SEV, but soon we got comments that this can
be abstracted and used by TDX and SEID for their use cases.

I see this patch as providing an abstraction for simple accounting of
resources used for creating secure execution contexts. Here, secure
execution is achieved through different means. SEID, TDX, and SEV
provide security using different features and capabilities. I am not
sure if we will reach a point where all three and other vendors will use
the same approach and technology for this purpose.

Instead of each one coming up with their own resource tracking for their
features, this patch is providing a common framework and cgroup for
tracking these resources.

> 
> * If there can be a shared abstraction which hopefully makes intuitive
>   sense, that'd be ideal. It doesn't have to be one knob but it shouldn't be
>   something arbitrary to specific vendors.

I think we should see these as features provided on a host. Tasks can
be executed securely on a host with the guarantees provided by the
specific feature (SEV, SEV-ES, TDX, SEID) used by the task.

I don't think each H/W vendor can agree to a common set of security
guarantees and approach.

> 
> * If we aren't there yet and vendor-specific interface is a must, attach
>   that part to an interface which is already vendor-aware.
Sorry, I don't understand this approach. Can you please give more
details about it?

Thanks
Vipin
Tejun Heo Jan. 20, 2021, 4:40 p.m. UTC | #8
Hello,

On Tue, Jan 19, 2021 at 11:13:51PM -0800, Vipin Sharma wrote:
> > Can you please elaborate? I skimmed through the amd manual and it seemed to
> > say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the use
> > case for mixing those two?
> 
> For example, customers can be given options for which kind of protection they
> want to choose for their workloads based on factors like data protection
> requirement, cost, speed, etc.

So, I'm looking for is a bit more in-depth analysis than that. ie. What's
the downside of SEV && !SEV-ES and is the disticntion something inherently
useful?

> In terms of features SEV-ES is superset of SEV but that doesn't mean SEV
> ASIDs are superset of SEV ASIDs. SEV ASIDs cannot be used for SEV-ES VMs
> and similarly SEV-ES ASIDs cannot be used for SEV VMs. Once a system is
> booted, based on the BIOS settings each type will have their own
> capacity and that number cannot be changed until the next boot and BIOS
> changes.

Here's an excerpt from the AMD's system programming manual, section 15.35.2:

  On some systems, there is a limitation on which ASID values can be used on
  SEV guests that are run with SEV-ES disabled. While SEV-ES may be enabled
  on any valid SEV ASID (as defined by CPUID Fn8000_001F[ECX]), there are
  restrictions on which ASIDs may be used for SEV guests with SEV- ES
  disabled. CPUID Fn8000_001F[EDX] indicates the minimum ASID value that
  must be used for an SEV-enabled, SEV-ES-disabled guest. For example, if
  CPUID Fn8000_001F[EDX] returns the value 5, then any VMs which use ASIDs
  1-4 and which enable SEV must also enable SEV-ES.

> We are not mixing the two types of ASIDs, they are separate and used
> separately.

Maybe in practice, the key management on the BIOS side is implemented in a
more restricted way but at least the processor manual says differently.

> > I'm very reluctant to ack vendor specific interfaces for a few reasons but
> > most importantly because they usually indicate abstraction and/or the
> > underlying feature not being sufficiently developed and they tend to become
> > baggages after a while. So, here are my suggestions:
> 
> My first patch was only for SEV, but soon we got comments that this can
> be abstracted and used by TDX and SEID for their use cases.
> 
> I see this patch as providing an abstraction for simple accounting of
> resources used for creating secure execution contexts. Here, secure
> execution is achieved through different means. SEID, TDX, and SEV
> provide security using different features and capabilities. I am not
> sure if we will reach a point where all three and other vendors will use
> the same approach and technology for this purpose.
> 
> Instead of each one coming up with their own resource tracking for their
> features, this patch is providing a common framework and cgroup for
> tracking these resources.

What's implemented is a shared place where similar things can be thrown in
bu from user's perspective the underlying hardware feature isn't really
abstracted. It's just exposing whatever hardware knobs there are. If you
look at any other cgroup controllers, nothing is exposing this level of
hardware dependent details and I'd really like to keep it that way.

So, what I'm asking for is more in-depth analysis of the landscape and
inherent differences among different vendor implementations to see whether
there can be better approaches or we should just wait and see.

> > * If there can be a shared abstraction which hopefully makes intuitive
> >   sense, that'd be ideal. It doesn't have to be one knob but it shouldn't be
> >   something arbitrary to specific vendors.
> 
> I think we should see these as features provided on a host. Tasks can
> be executed securely on a host with the guarantees provided by the
> specific feature (SEV, SEV-ES, TDX, SEID) used by the task.
> 
> I don't think each H/W vendor can agree to a common set of security
> guarantees and approach.

Do TDX and SEID have multiple key types tho?

> > * If we aren't there yet and vendor-specific interface is a must, attach
> >   that part to an interface which is already vendor-aware.
> Sorry, I don't understand this approach. Can you please give more
> details about it?

Attaching the interface to kvm side, most likely, instead of exposing the
feature through cgroup.

Thanks.
Vipin Sharma Jan. 20, 2021, 11:18 p.m. UTC | #9
On Wed, Jan 20, 2021 at 11:40:18AM -0500, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jan 19, 2021 at 11:13:51PM -0800, Vipin Sharma wrote:
> > > Can you please elaborate? I skimmed through the amd manual and it seemed to
> > > say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the use
> > > case for mixing those two?
> > 
> > For example, customers can be given options for which kind of protection they
> > want to choose for their workloads based on factors like data protection
> > requirement, cost, speed, etc.
> 
> So, I'm looking for is a bit more in-depth analysis than that. ie. What's
> the downside of SEV && !SEV-ES and is the disticntion something inherently
> useful?

I will leave this for AMD folks to respond, as they can give much better
answer than me.

> > > I'm very reluctant to ack vendor specific interfaces for a few reasons but
> > > most importantly because they usually indicate abstraction and/or the
> > > underlying feature not being sufficiently developed and they tend to become
> > > baggages after a while. So, here are my suggestions:
> > 
> > My first patch was only for SEV, but soon we got comments that this can
> > be abstracted and used by TDX and SEID for their use cases.
> > 
> > I see this patch as providing an abstraction for simple accounting of
> > resources used for creating secure execution contexts. Here, secure
> > execution is achieved through different means. SEID, TDX, and SEV
> > provide security using different features and capabilities. I am not
> > sure if we will reach a point where all three and other vendors will use
> > the same approach and technology for this purpose.
> > 
> > Instead of each one coming up with their own resource tracking for their
> > features, this patch is providing a common framework and cgroup for
> > tracking these resources.
> 
> What's implemented is a shared place where similar things can be thrown in
> bu from user's perspective the underlying hardware feature isn't really
> abstracted. It's just exposing whatever hardware knobs there are. If you
> look at any other cgroup controllers, nothing is exposing this level of
> hardware dependent details and I'd really like to keep it that way.

RDMA cgroup expose hardware details to users. In rdma.{max, current}
interface files we can see actual hardware names. Only difference
compared to Encryption ID cgroup is that latter is exposing that detail
via file names.

Will you prefer that encryption ID cgroup do things similar to RDMA
cgroup? It can have 3 files
1. encids.capacity (only on root)
   Shows features (SEV, SEV-ES, TDX, SEID) available along with capacity
   on the host.
   $ cat encids.capacity
   sev 400
   sev-es 100

2. encids.max (only on non-root)
   Allows setting of the max value of a feature in the cgroup.
   It will only show max for features shown in the capacity file.
   $ cat encids.max
   sev max
   sev-es 100

3. encids.current (all levels)
   Shows total getting used in the cgroup and its descendants.
   $ cat encids.current
   sev 3
   sev-es 0

> 
> So, what I'm asking for is more in-depth analysis of the landscape and
> inherent differences among different vendor implementations to see whether
> there can be better approaches or we should just wait and see.
> 
> > > * If there can be a shared abstraction which hopefully makes intuitive
> > >   sense, that'd be ideal. It doesn't have to be one knob but it shouldn't be
> > >   something arbitrary to specific vendors.
> > 
> > I think we should see these as features provided on a host. Tasks can
> > be executed securely on a host with the guarantees provided by the
> > specific feature (SEV, SEV-ES, TDX, SEID) used by the task.
> > 
> > I don't think each H/W vendor can agree to a common set of security
> > guarantees and approach.
> 
> Do TDX and SEID have multiple key types tho?
To my limited knowledge I don't think so. I don't know their future
plans.

> 
> > > * If we aren't there yet and vendor-specific interface is a must, attach
> > >   that part to an interface which is already vendor-aware.
> > Sorry, I don't understand this approach. Can you please give more
> > details about it?
> 
> Attaching the interface to kvm side, most likely, instead of exposing the
> feature through cgroup.
I am little confused, do you mean moving files from the kernel/cgroup/
to kvm related directories or you are recommending not to use cgroup at
all?  I hope it is the former :)

Only issue with this is that TDX is not limited to KVM, they have
potential use cases for MKTME without KVM.

Thanks
Vipin
Tejun Heo Jan. 20, 2021, 11:32 p.m. UTC | #10
Hello,

On Wed, Jan 20, 2021 at 03:18:29PM -0800, Vipin Sharma wrote:
> RDMA cgroup expose hardware details to users. In rdma.{max, current}
> interface files we can see actual hardware names. Only difference

No, what's shown is the device name followed by resources which are commonly
defined for all rdma devices. The format is the same as io controller
interface files.

> compared to Encryption ID cgroup is that latter is exposing that detail
> via file names.
> 
> Will you prefer that encryption ID cgroup do things similar to RDMA
> cgroup? It can have 3 files

I don't know how many times I have to repeat the same point to get it
across. For any question about actual abstraction, you haven't provided any
kind of actual research or analysis and just keep pushing the same thing
over and over again. Maybe the situation is such that it makes sense to
change the rule but that needs substantial justifications. I've been asking
to see whether there are such justifications but all I've been getting are
empty answers. Until such discussions take place, please consider the series
nacked and please excuse if I don't respond promptly in this thread.

> > Attaching the interface to kvm side, most likely, instead of exposing the
> > feature through cgroup.
> I am little confused, do you mean moving files from the kernel/cgroup/
> to kvm related directories or you are recommending not to use cgroup at
> all?  I hope it is the former :)
> 
> Only issue with this is that TDX is not limited to KVM, they have
> potential use cases for MKTME without KVM.

There are ways to integrate with cgroup through other interfaces - e.g. take
a look at how bpf works with cgroups. Here, it isn't ideal but may work out
if things actually require a lot of hardware dependent bits. There's also
RDT which exists outside of cgroup for similar reasons.

Thanks.
Tom Lendacky Jan. 21, 2021, 2:55 p.m. UTC | #11
On 1/20/21 10:40 AM, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jan 19, 2021 at 11:13:51PM -0800, Vipin Sharma wrote:
>>> Can you please elaborate? I skimmed through the amd manual and it seemed to
>>> say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the use
>>> case for mixing those two?
>>
>> For example, customers can be given options for which kind of protection they
>> want to choose for their workloads based on factors like data protection
>> requirement, cost, speed, etc.
> 
> So, I'm looking for is a bit more in-depth analysis than that. ie. What's
> the downside of SEV && !SEV-ES and is the disticntion something inherently
> useful?
> 
>> In terms of features SEV-ES is superset of SEV but that doesn't mean SEV
>> ASIDs are superset of SEV ASIDs. SEV ASIDs cannot be used for SEV-ES VMs
>> and similarly SEV-ES ASIDs cannot be used for SEV VMs. Once a system is
>> booted, based on the BIOS settings each type will have their own
>> capacity and that number cannot be changed until the next boot and BIOS
>> changes.
> 
> Here's an excerpt from the AMD's system programming manual, section 15.35.2:
> 
>    On some systems, there is a limitation on which ASID values can be used on
>    SEV guests that are run with SEV-ES disabled. While SEV-ES may be enabled
>    on any valid SEV ASID (as defined by CPUID Fn8000_001F[ECX]), there are
>    restrictions on which ASIDs may be used for SEV guests with SEV- ES
>    disabled. CPUID Fn8000_001F[EDX] indicates the minimum ASID value that
>    must be used for an SEV-enabled, SEV-ES-disabled guest. For example, if
>    CPUID Fn8000_001F[EDX] returns the value 5, then any VMs which use ASIDs
>    1-4 and which enable SEV must also enable SEV-ES.

The hardware will allow any SEV capable ASID to be run as SEV-ES, however, 
the SEV firmware will not allow the activation of an SEV-ES VM to be 
assigned to an ASID greater than or equal to the SEV minimum ASID value. 
The reason for the latter is to prevent an !SEV-ES ASID starting out as an 
SEV-ES guest and then disabling the SEV-ES VMCB bit that is used by VMRUN. 
This would result in the downgrading of the security of the VM without the 
VM realizing it.

As a result, you have a range of ASIDs that can only run SEV-ES VMs and a 
range of ASIDs that can only run SEV VMs.

Thanks,
Tom

> 
>> We are not mixing the two types of ASIDs, they are separate and used
>> separately.
> 
> Maybe in practice, the key management on the BIOS side is implemented in a
> more restricted way but at least the processor manual says differently.
> 
>>> I'm very reluctant to ack vendor specific interfaces for a few reasons but
>>> most importantly because they usually indicate abstraction and/or the
>>> underlying feature not being sufficiently developed and they tend to become
>>> baggages after a while. So, here are my suggestions:
>>
>> My first patch was only for SEV, but soon we got comments that this can
>> be abstracted and used by TDX and SEID for their use cases.
>>
>> I see this patch as providing an abstraction for simple accounting of
>> resources used for creating secure execution contexts. Here, secure
>> execution is achieved through different means. SEID, TDX, and SEV
>> provide security using different features and capabilities. I am not
>> sure if we will reach a point where all three and other vendors will use
>> the same approach and technology for this purpose.
>>
>> Instead of each one coming up with their own resource tracking for their
>> features, this patch is providing a common framework and cgroup for
>> tracking these resources.
> 
> What's implemented is a shared place where similar things can be thrown in
> bu from user's perspective the underlying hardware feature isn't really
> abstracted. It's just exposing whatever hardware knobs there are. If you
> look at any other cgroup controllers, nothing is exposing this level of
> hardware dependent details and I'd really like to keep it that way.
> 
> So, what I'm asking for is more in-depth analysis of the landscape and
> inherent differences among different vendor implementations to see whether
> there can be better approaches or we should just wait and see.
> 
>>> * If there can be a shared abstraction which hopefully makes intuitive
>>>    sense, that'd be ideal. It doesn't have to be one knob but it shouldn't be
>>>    something arbitrary to specific vendors.
>>
>> I think we should see these as features provided on a host. Tasks can
>> be executed securely on a host with the guarantees provided by the
>> specific feature (SEV, SEV-ES, TDX, SEID) used by the task.
>>
>> I don't think each H/W vendor can agree to a common set of security
>> guarantees and approach.
> 
> Do TDX and SEID have multiple key types tho?
> 
>>> * If we aren't there yet and vendor-specific interface is a must, attach
>>>    that part to an interface which is already vendor-aware.
>> Sorry, I don't understand this approach. Can you please give more
>> details about it?
> 
> Attaching the interface to kvm side, most likely, instead of exposing the
> feature through cgroup.
> 
> Thanks.
>
Tejun Heo Jan. 21, 2021, 3:55 p.m. UTC | #12
Hello,

On Thu, Jan 21, 2021 at 08:55:07AM -0600, Tom Lendacky wrote:
> The hardware will allow any SEV capable ASID to be run as SEV-ES, however,
> the SEV firmware will not allow the activation of an SEV-ES VM to be
> assigned to an ASID greater than or equal to the SEV minimum ASID value. The
> reason for the latter is to prevent an !SEV-ES ASID starting out as an
> SEV-ES guest and then disabling the SEV-ES VMCB bit that is used by VMRUN.
> This would result in the downgrading of the security of the VM without the
> VM realizing it.
> 
> As a result, you have a range of ASIDs that can only run SEV-ES VMs and a
> range of ASIDs that can only run SEV VMs.

I see. That makes sense. What's the downside of SEV-ES compared to SEV w/o
ES? Are there noticeable performance / feature penalties or is the split
mostly for backward compatibility?

Thanks.
Tom Lendacky Jan. 21, 2021, 11:12 p.m. UTC | #13
On 1/21/21 9:55 AM, Tejun Heo wrote:
> Hello,
> 
> On Thu, Jan 21, 2021 at 08:55:07AM -0600, Tom Lendacky wrote:
>> The hardware will allow any SEV capable ASID to be run as SEV-ES, however,
>> the SEV firmware will not allow the activation of an SEV-ES VM to be
>> assigned to an ASID greater than or equal to the SEV minimum ASID value. The
>> reason for the latter is to prevent an !SEV-ES ASID starting out as an
>> SEV-ES guest and then disabling the SEV-ES VMCB bit that is used by VMRUN.
>> This would result in the downgrading of the security of the VM without the
>> VM realizing it.
>>
>> As a result, you have a range of ASIDs that can only run SEV-ES VMs and a
>> range of ASIDs that can only run SEV VMs.
> 
> I see. That makes sense. What's the downside of SEV-ES compared to SEV w/o
> ES? Are there noticeable performance / feature penalties or is the split
> mostly for backward compatibility?

SEV-ES is an incremental enhancement of SEV where the register state of 
the guest is protected/encrypted. As with a lot of performance questions, 
the answer is ...it depends. With SEV-ES, there is additional overhead 
associated with a world switch (VMRUN/VMEXIT) to restore and save 
additional register state. Also, exit events are now divided up into 
automatic exits (AE) and non-automatic exits (NAE). NAE events result in a 
new #VC exception being generated where the guest is then required to use 
the VMGEXIT instruction to communicate only the state necessary to perform 
the operation. A CPUID instruction is a good example, where a shared page 
is used to communicate required state to the hypervisor to perform the 
CPUID emulation, which then returns the results back through the shared 
page to the guest. So it all depends on how often the workload in question 
performs operations that result in a VMEXIT of the vCPU, etc.

Thanks,
Tom

> 
> Thanks.
>
Vipin Sharma Jan. 22, 2021, 12:09 a.m. UTC | #14
On Wed, Jan 20, 2021 at 06:32:56PM -0500, Tejun Heo wrote:
> I don't know how many times I have to repeat the same point to get it
> across. For any question about actual abstraction, you haven't provided any
> kind of actual research or analysis and just keep pushing the same thing
> over and over again. Maybe the situation is such that it makes sense to
> change the rule but that needs substantial justifications. I've been asking
> to see whether there are such justifications but all I've been getting are
> empty answers. Until such discussions take place, please consider the series
> nacked and please excuse if I don't respond promptly in this thread.

I am sorry Tejun that you felt your feedback and questions are being ignored
or not answered properly by me. It was not my intent. Let me try again.

I am not able to come up with an abstraction for underlying the hardware
like we have for memory, cpu, and io with their respective cgroup
controllers, because each vendor is solving VM security issue in
different ways. For example:

s390 is using Ultravisor (UV) to disable access to the VMs memory from
the host.  All KVM interaction with their Protected Virtual Machines
(PVM) are handled through UV APIs. Here an encrypted guest image is
loaded first which is decrypted by UV and then UV disallows access to
PVMs memory and register state from KVM or other PVMs. PVMs are assigned
IDs known as secure execution IDs (SEID).  These IDs are not scarce
resource on the host.

AMD is encrypting runtime memory of a VM using an hardware AES engine in
the memory controller and keys are managed by an Arm based coprocessor
inside the CPU, for encryption and decryption of the data flow between
CPU and memory.  Their offering is known as Secure Encrypted
Virtualization (SEV). There are also two more enhanced offerings SEV-ES,
(memory + guest register state encryption), SEV-SNP (SEV-ES + memory
integrity protection + TCB rollback) in later generation of CPUs. At any
time only a limited number of IDs can be used simultaneously in the
processor. Initially only SEV IDs we available on the CPUs but in the
later generations of CPUs with the addition of SEV-ES, IDs were divided
in two groups SEV ASIDs for SEV guests, and SEV-ES ASIDs for SEV-ES and
SEV-SNP VMs. SEV firmware doesn't allow SEV ASIDs to launch SEV-ES and
SEV-SNP VMs. Ideally, I think its better to use SEV-SNP as it provides
highest protection but support in vmm and guest kernels are not there
yet. Also, old HW will not be able to run SEV-ES or SEV-SNP as they can
only run SEV ASIDs. I dont have data in terms of drawbacks running VM on
SEV-SNP in terms of speed and cost but I think it will be dependent on
workloads.

Intel has come up with Trusted Domain Extension (TDX) for their secure
VMs offering. They allow a VM to use multiple keys for private pages and
for pages shared with other VMs. Overall, this is called as Multi-Key
Total Memory Encryption (MKTME). A fixed number of encryption keys are
supported in MKTME engine. During execution these keys are identified
using KeyIDs which are present in upper bits of platform physical
addresses.

Only limited form of abstraction present here is that all are providing
a way to have secure VMs and processes, either through single key
encryption, multiple key encryptions or access denial.

A common abstraction of different underlying security behavior/approach
can mislead users in giving impression that all secure VMs/processes are
same. In my opinion, this kind of thing can work when we talk about
memory, cpu, etc, but for security related stuff will do more harm to
the end user than the benefit of simplicity of abstraction. The name of
the underlying feature also tells what kind of security guarantees a
user can expect on the platform for a VM and what kind is used.

Taking a step back, in the current scenario, we have some global shared
resources which are limited for SEV, SEV-ES, and TDX. There is also a
need for tracking and controlling on all 4 features for now. This is a
case for some kind of cgroup behavior to limit and control an aggregate
of processes using these system resources. After all, "cgroup is a
mechanism to organize processes hierarchically and distribute system
resources along the hierarchy in a controlled and configurable manner."

We are using SEV in KVM and outside KVM also for other products on
horizon. As cgroups are commonly used in many infrastructures for
resource control, scheduling, and tracking, this patch is helping us in
allocating jobs in the infrastructure along with memory, cpu and other
constraints in a coherent way.

If you feel encryption id cgroup is not good for long term or a
too specific use case then may be there should be a common cgroup which
can be a home for this kind and other kind of future resources where
there is need to limit a global resource allocation but are not abstract
or cannot be abstracted as the other existing cgroups. My current patch
is very generic and with few modifications, it can provide subsystems,
having valid requirements, a capability to use their own simple cgroup
interfaces with minimal code duplication and get robustness of generic
cgroup for free. Here, SEV will be the first user of this generic
cgroup. Need for this is clearly there.

Thanks Vipin
Sean Christopherson Jan. 22, 2021, 1:25 a.m. UTC | #15
On Thu, Jan 21, 2021, Tom Lendacky wrote:
> On 1/21/21 9:55 AM, Tejun Heo wrote:
> > Hello,
> > 
> > On Thu, Jan 21, 2021 at 08:55:07AM -0600, Tom Lendacky wrote:
> > > The hardware will allow any SEV capable ASID to be run as SEV-ES, however,
> > > the SEV firmware will not allow the activation of an SEV-ES VM to be
> > > assigned to an ASID greater than or equal to the SEV minimum ASID value. The
> > > reason for the latter is to prevent an !SEV-ES ASID starting out as an
> > > SEV-ES guest and then disabling the SEV-ES VMCB bit that is used by VMRUN.
> > > This would result in the downgrading of the security of the VM without the
> > > VM realizing it.
> > > 
> > > As a result, you have a range of ASIDs that can only run SEV-ES VMs and a
> > > range of ASIDs that can only run SEV VMs.
> > 
> > I see. That makes sense. What's the downside of SEV-ES compared to SEV w/o
> > ES? Are there noticeable performance / feature penalties or is the split
> > mostly for backward compatibility?
> 
> SEV-ES is an incremental enhancement of SEV where the register state of the
> guest is protected/encrypted. As with a lot of performance questions, the
> answer is ...it depends. 

True, but the expected dual-usage is more about backwards compatibility than
anything else.  Running an SEV-ES VM requires a heavily enlightened guest vBIOS
and kernel, which means that a VM that was created as an SEV guest cannot easily
be converted to an SEV-ES guest, and it would require cooperation from the guest
(if it's even feasible?).

SEV-SNP, another incremental enhancement (on SEV-ES), further strengthens the
argument for SEV and SEV-* coexistenence.  SEV-SNP and SEV-ES will share the
same ASID range, so the question is really, "do we expect to run SEV guests and
any flavor of SEV-* guests on the same platform".  And due to SEV-* not being
directly backward compatible with SEV, the answer will eventually be "yes", as
we'll want to keep running existing SEV guest while also spinning up new SEV-*
guests.

That being said, it's certainly possible to abstract the different key types
between AMD and Intel (assuming s390 won't use the cgroup due to it's plethora
of keys).  TDX private keys are equivalent to SEV-ES ASIDs, and MKTME keys (if
the kernel ever gains a user) could be thrown into the same bucket as SEV IDs,
albeit with some minor mental gymnastics.

E.g. this mapping isn't horrendous:

  encrpytion_ids.basic.*       == SEV   == MKTME
  encrpytion_ids.enhanced.*    == SEV-* == TDX

The names will likely be a bit vague, but I don't think they'll be so far off
that it'd be impossible for someone with SEV/TDX knowledge to glean their intent.
And realistically, if anyone gets to the point where they care about controlling
SEV or TDX IDs, they've already plowed through hundreds of pages of dense
documentation; having to read a few lines of cgroup docs to understand basic vs.
enhanced probably won't faze them at all.
David Rientjes Jan. 26, 2021, 8:49 p.m. UTC | #16
On Thu, 21 Jan 2021, Sean Christopherson wrote:

> True, but the expected dual-usage is more about backwards compatibility than
> anything else.  Running an SEV-ES VM requires a heavily enlightened guest vBIOS
> and kernel, which means that a VM that was created as an SEV guest cannot easily
> be converted to an SEV-ES guest, and it would require cooperation from the guest
> (if it's even feasible?).
> 
> SEV-SNP, another incremental enhancement (on SEV-ES), further strengthens the
> argument for SEV and SEV-* coexistenence.  SEV-SNP and SEV-ES will share the
> same ASID range, so the question is really, "do we expect to run SEV guests and
> any flavor of SEV-* guests on the same platform".  And due to SEV-* not being
> directly backward compatible with SEV, the answer will eventually be "yes", as
> we'll want to keep running existing SEV guest while also spinning up new SEV-*
> guests.
> 

Agreed, cloud providers will most certainly want to run both SEV and SEV-* 
guests on the same platform.

> That being said, it's certainly possible to abstract the different key types
> between AMD and Intel (assuming s390 won't use the cgroup due to it's plethora
> of keys).  TDX private keys are equivalent to SEV-ES ASIDs, and MKTME keys (if
> the kernel ever gains a user) could be thrown into the same bucket as SEV IDs,
> albeit with some minor mental gymnastics.
> 
> E.g. this mapping isn't horrendous:
> 
>   encrpytion_ids.basic.*       == SEV   == MKTME
>   encrpytion_ids.enhanced.*    == SEV-* == TDX
> 
> The names will likely be a bit vague, but I don't think they'll be so far off
> that it'd be impossible for someone with SEV/TDX knowledge to glean their intent.
> And realistically, if anyone gets to the point where they care about controlling
> SEV or TDX IDs, they've already plowed through hundreds of pages of dense
> documentation; having to read a few lines of cgroup docs to understand basic vs.
> enhanced probably won't faze them at all.
> 

The abstraction makes sense for both AMD and Intel offerings today.  It 
makes me wonder if we want a read-only 
encryption_ids.{basic,enhanced}.type file to describe the underlying 
technology ("SEV-ES/SEV-SNP", "TDX", etc).  Since the technology is 
discoverable by other means and we are assuming one encryption type per 
pool of encryption ids, we likely don't need this.

I'm slightly concerned about extensibility if there is to be an 
incremental enhancement atop SEV-* or TDX with yet another pool of 
encryption ids.  (For example, when we only had hugepages, this name was 
perfect; then we got 1GB pages which became "gigantic pages", so are 512GB 
pages "enormous"? :)  I could argue (encryption_ids.basic.*,
encryption_ids.enhanced.*) should map to 
(encryption_ids.legacy.*, encryption_ids.*) but that's likely 
bikeshedding.

Thomas: does encryption_ids.{basic,enhanced}.* make sense for ASID 
partitioning?

Tejun: if this makes sense for legacy SEV and SEV-* per Thomas, and this 
is now abstracted to be technology (vendor) neutral, does this make sense 
to you?
Tejun Heo Jan. 26, 2021, 10:01 p.m. UTC | #17
Hello,

On Tue, Jan 26, 2021 at 12:49:14PM -0800, David Rientjes wrote:
> > SEV-SNP, another incremental enhancement (on SEV-ES), further strengthens the
> > argument for SEV and SEV-* coexistenence.  SEV-SNP and SEV-ES will share the
> > same ASID range, so the question is really, "do we expect to run SEV guests and
> > any flavor of SEV-* guests on the same platform".  And due to SEV-* not being
> > directly backward compatible with SEV, the answer will eventually be "yes", as
> > we'll want to keep running existing SEV guest while also spinning up new SEV-*
> > guests.
> > 
> 
> Agreed, cloud providers will most certainly want to run both SEV and SEV-* 
> guests on the same platform.

Am I correct in thinking that the reason why these IDs are limited is
because they need to be embedded into the page table entries? If so, we
aren't talking about that many IDs and having to divide the already small
pool into disjoint purposes doesn't seem like a particularly smart use of
those bits. It is what it is, I guess.

> I'm slightly concerned about extensibility if there is to be an 
> incremental enhancement atop SEV-* or TDX with yet another pool of 
> encryption ids.  (For example, when we only had hugepages, this name was 
> perfect; then we got 1GB pages which became "gigantic pages", so are 512GB 
> pages "enormous"? :)  I could argue (encryption_ids.basic.*,
> encryption_ids.enhanced.*) should map to 
> (encryption_ids.legacy.*, encryption_ids.*) but that's likely 
> bikeshedding.
> 
> Thomas: does encryption_ids.{basic,enhanced}.* make sense for ASID 
> partitioning?
> 
> Tejun: if this makes sense for legacy SEV and SEV-* per Thomas, and this 
> is now abstracted to be technology (vendor) neutral, does this make sense 
> to you?

The whole thing seems pretty immature to me and I agree with you that coming
up with an abstraction at this stage feels risky.

I'm leaning towards creating a misc controller to shove these things into:

* misc.max and misc.current: nested keyed files listing max and current
  usage for the cgroup.

* Have an API to activate or update a given resource with total resource
  count. I'd much prefer the resource list to be in the controller itself
  rather than being through some dynamic API just so that there is some
  review in what keys get added.

* Top level cgroup lists which resource is active and how many are
  available.

So, behavior-wise, not that different from the proposed code. Just made
generic into a misc controller. Would that work?

Thanks.
Tejun Heo Jan. 26, 2021, 10:02 p.m. UTC | #18
On Tue, Jan 26, 2021 at 05:01:04PM -0500, Tejun Heo wrote:
> * misc.max and misc.current: nested keyed files listing max and current
>   usage for the cgroup.

Keyed files, not nested keyed files.

Thanks.
Vipin Sharma Jan. 27, 2021, 1:11 a.m. UTC | #19
On Tue, Jan 26, 2021 at 05:01:04PM -0500, Tejun Heo wrote:
> The whole thing seems pretty immature to me and I agree with you that coming
> up with an abstraction at this stage feels risky.
> 
> I'm leaning towards creating a misc controller to shove these things into:
> 
> * misc.max and misc.current: nested keyed files listing max and current
>   usage for the cgroup.
> 
> * Have an API to activate or update a given resource with total resource
>   count. I'd much prefer the resource list to be in the controller itself
>   rather than being through some dynamic API just so that there is some
>   review in what keys get added.
> 
> * Top level cgroup lists which resource is active and how many are
>   available.

Sounds good, we can have a single top level stat file

misc.stat
  Shows how many are supported on the host:
  $ cat misc.stat
  sev 500
  sev_es 10

If total value of some resource is 0 then it will be considered inactive and
won't show in misc.{stat, current, max}

We discussed earlier, instead of having "stat" file we should show
"current" and "capacity" files in the root but I think we can just have stat
at top showing total resources to keep it consistent with other cgroup
files.

Thanks
Vipin
Tejun Heo Jan. 27, 2021, 2:10 p.m. UTC | #20
Hello,

On Tue, Jan 26, 2021 at 05:11:59PM -0800, Vipin Sharma wrote:
> Sounds good, we can have a single top level stat file
> 
> misc.stat
>   Shows how many are supported on the host:
>   $ cat misc.stat
>   sev 500
>   sev_es 10
> 
> If total value of some resource is 0 then it will be considered inactive and
> won't show in misc.{stat, current, max}
> 
> We discussed earlier, instead of having "stat" file we should show
> "current" and "capacity" files in the root but I think we can just have stat
> at top showing total resources to keep it consistent with other cgroup
> files.

Let's do misc.capacity and show only the entries which have their resources
initialized.

Thanks.

Patch
diff mbox series

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 9858d5ae9ddd..1924ab2eaf11 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@ 
 #include <linux/psp-sev.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
+#include <linux/encryption_ids_cgroup.h>
 #include <linux/processor.h>
 #include <linux/trace_events.h>
 #include <asm/fpu/internal.h>
@@ -86,10 +87,18 @@  static bool __sev_recycle_asids(int min_asid, int max_asid)
 	return true;
 }
 
-static int sev_asid_new(struct kvm_sev_info *sev)
+static int sev_asid_new(struct kvm *kvm)
 {
-	int pos, min_asid, max_asid;
+	int pos, min_asid, max_asid, ret;
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
 	bool retry = true;
+	enum encryption_id_type type;
+
+	type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
+
+	ret = enc_id_cg_try_charge(kvm, type, 1);
+	if (ret)
+		return ret;
 
 	mutex_lock(&sev_bitmap_lock);
 
@@ -107,7 +116,8 @@  static int sev_asid_new(struct kvm_sev_info *sev)
 			goto again;
 		}
 		mutex_unlock(&sev_bitmap_lock);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto e_uncharge;
 	}
 
 	__set_bit(pos, sev_asid_bitmap);
@@ -115,6 +125,9 @@  static int sev_asid_new(struct kvm_sev_info *sev)
 	mutex_unlock(&sev_bitmap_lock);
 
 	return pos + 1;
+e_uncharge:
+	enc_id_cg_uncharge(kvm, type, 1);
+	return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -124,14 +137,16 @@  static int sev_get_asid(struct kvm *kvm)
 	return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm *kvm)
 {
 	struct svm_cpu_data *sd;
 	int cpu, pos;
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	enum encryption_id_type type;
 
 	mutex_lock(&sev_bitmap_lock);
 
-	pos = asid - 1;
+	pos = sev->asid - 1;
 	__set_bit(pos, sev_reclaim_asid_bitmap);
 
 	for_each_possible_cpu(cpu) {
@@ -140,6 +155,9 @@  static void sev_asid_free(int asid)
 	}
 
 	mutex_unlock(&sev_bitmap_lock);
+
+	type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
+	enc_id_cg_uncharge(kvm, type, 1);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -184,22 +202,22 @@  static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (unlikely(sev->active))
 		return ret;
 
-	asid = sev_asid_new(sev);
+	asid = sev_asid_new(kvm);
 	if (asid < 0)
 		return ret;
+	sev->asid = asid;
 
 	ret = sev_platform_init(&argp->error);
 	if (ret)
 		goto e_free;
 
 	sev->active = true;
-	sev->asid = asid;
 	INIT_LIST_HEAD(&sev->regions_list);
 
 	return 0;
 
 e_free:
-	sev_asid_free(asid);
+	sev_asid_free(kvm);
 	return ret;
 }
 
@@ -1240,12 +1258,12 @@  void sev_vm_destroy(struct kvm *kvm)
 	mutex_unlock(&kvm->lock);
 
 	sev_unbind_asid(kvm, sev->handle);
-	sev_asid_free(sev->asid);
+	sev_asid_free(kvm);
 }
 
 void __init sev_hardware_setup(void)
 {
-	unsigned int eax, ebx, ecx, edx;
+	unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
 	bool sev_es_supported = false;
 	bool sev_supported = false;
 
@@ -1277,7 +1295,11 @@  void __init sev_hardware_setup(void)
 	if (!sev_reclaim_asid_bitmap)
 		goto out;
 
-	pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+	sev_asid_count = max_sev_asid - min_sev_asid + 1;
+	if (enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, sev_asid_count))
+		goto out;
+
+	pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
 	sev_supported = true;
 
 	/* SEV-ES support requested? */
@@ -1292,7 +1314,11 @@  void __init sev_hardware_setup(void)
 	if (min_sev_asid == 1)
 		goto out;
 
-	pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+	sev_es_asid_count = min_sev_asid - 1;
+	if (enc_id_cg_set_capacity(ENCRYPTION_ID_SEV_ES, sev_es_asid_count))
+		goto out;
+
+	pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
 	sev_es_supported = true;
 
 out:
@@ -1307,6 +1333,8 @@  void sev_hardware_teardown(void)
 
 	bitmap_free(sev_asid_bitmap);
 	bitmap_free(sev_reclaim_asid_bitmap);
+	enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, 0);
+	enc_id_cg_set_capacity(ENCRYPTION_ID_SEV_ES, 0);
 
 	sev_flush_asids();
 }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..83754f58c05e 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@  SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_ENCRYPTION_IDS)
+SUBSYS(encryption_ids)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/encryption_ids_cgroup.h b/include/linux/encryption_ids_cgroup.h
new file mode 100644
index 000000000000..af428a4beb28
--- /dev/null
+++ b/include/linux/encryption_ids_cgroup.h
@@ -0,0 +1,72 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Encryption IDs cgroup controller.
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+#ifndef _ENCRYPTION_IDS_CGROUP_H_
+#define _ENCRYPTION_IDS_CGROUP_H_
+
+#include <linux/cgroup-defs.h>
+#include <linux/kvm_types.h>
+
+/**
+ * Types of encryption IDs supported by the host.
+ */
+enum encryption_id_type {
+#ifdef CONFIG_KVM_AMD_SEV
+	ENCRYPTION_ID_SEV,
+	ENCRYPTION_ID_SEV_ES,
+#endif
+	ENCRYPTION_ID_TYPES
+};
+
+#ifdef CONFIG_CGROUP_ENCRYPTION_IDS
+
+/**
+ * struct encryption_id_res: Per cgroup per encryption ID resource
+ * @max: Maximum count of encryption ID that can be used.
+ * @usage: Current usage of encryption ID in the cgroup.
+ */
+struct encryption_id_res {
+	unsigned int max;
+	unsigned int usage;
+};
+
+/**
+ * struct encryption_id_cgroup - Encryption IDs controller's cgroup structure.
+ * @css: cgroup subsys state object.
+ * @ids: Array of encryption IDs resource usage in the cgroup.
+ */
+struct encryption_id_cgroup {
+	struct cgroup_subsys_state css;
+	struct encryption_id_res res[ENCRYPTION_ID_TYPES];
+};
+
+int enc_id_cg_set_capacity(enum encryption_id_type type, unsigned int capacity);
+int enc_id_cg_try_charge(struct kvm *kvm, enum encryption_id_type type,
+			 unsigned int amount);
+void enc_id_cg_uncharge(struct kvm *kvm, enum encryption_id_type type,
+			unsigned int amount);
+#else
+static inline int enc_id_cg_set_capacity(enum encryption_id_type type,
+					 unsigned int capacity)
+{
+	return 0;
+}
+
+static inline int enc_id_cg_try_charge(struct kvm *kvm,
+				       enum encryption_id_type type,
+				       unsigned int amount)
+{
+	return 0;
+}
+
+static inline void enc_id_cg_uncharge(struct kvm *kvm,
+				      enum encryption_id_type type,
+				      unsigned int amount)
+{
+}
+#endif /* CONFIG_CGROUP_ENCRYPTION_IDS */
+#endif /* _ENCRYPTION_CGROUP_H_ */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f3b1013fb22c..ae9fde0d4267 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -27,6 +27,7 @@ 
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <asm/signal.h>
+#include <linux/encryption_ids_cgroup.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -513,6 +514,9 @@  struct kvm {
 	pid_t userspace_pid;
 	unsigned int max_halt_poll_ns;
 	u32 dirty_ring_size;
+#ifdef CONFIG_CGROUP_ENCRYPTION_IDS
+	struct encryption_id_cgroup *enc_id_cg;
+#endif
 };
 
 #define kvm_err(fmt, ...) \
diff --git a/init/Kconfig b/init/Kconfig
index b77c60f8b963..6c0bd0e7c08d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1106,6 +1106,20 @@  config CGROUP_BPF
 	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
 	  inet sockets.
 
+config CGROUP_ENCRYPTION_IDS
+	bool "Encryption IDs controller"
+	depends on KVM_AMD_SEV
+	default n
+	help
+	  Provides a controller for CPU encryption IDs on a host.
+
+	  Some platforms have limited number of encryption IDs which can be
+	  used simultaneously, e.g., AMD's Secure Encrypted Virtualization
+	  (SEV). This controller tracks and limits the total number of IDs used
+	  by processes attached to a cgroup hierarchy. For more information,
+	  please check Encryption IDs section in
+	  /Documentation/admin-guide/cgroup-v2.rst.
+
 config CGROUP_DEBUG
 	bool "Debug controller"
 	default n
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 5d7a76bfbbb7..6c19208dfb7f 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,4 +5,5 @@  obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_ENCRYPTION_IDS) += encryption_ids.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/encryption_ids.c b/kernel/cgroup/encryption_ids.c
new file mode 100644
index 000000000000..7cd7d3951bb9
--- /dev/null
+++ b/kernel/cgroup/encryption_ids.c
@@ -0,0 +1,422 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Encryption IDs cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+#include <linux/kvm_host.h>
+#include <linux/encryption_ids_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM UINT_MAX
+
+/* Root Encryption ID cgroup */
+static struct encryption_id_cgroup root_cg;
+
+/* Lock for tracking and updating encryption ID resources. */
+static DEFINE_SPINLOCK(enc_id_cg_lock);
+
+/* Encryption ID types capacity. */
+static unsigned int enc_id_capacity[ENCRYPTION_ID_TYPES];
+
+/**
+ * css_enc() - Get encryption ID cgroup from the css.
+ * @css: cgroup subsys state object.
+ *
+ * Context: Any context.
+ * Return:
+ * * %NULL - If @css is null.
+ * * struct encryption_id_cgroup* - Encryption ID cgroup pointer of the passed
+ *				    css.
+ */
+static struct encryption_id_cgroup *css_enc(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct encryption_id_cgroup, css) : NULL;
+}
+
+/**
+ * parent_enc() - Get the parent of the passed encryption ID cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct encryption_id_cgroup* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct encryption_id_cgroup *
+parent_enc(struct encryption_id_cgroup *cgroup)
+{
+	return cgroup ? css_enc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: encryption ID type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum encryption_id_type type)
+{
+	return type >= 0 && type < ENCRYPTION_ID_TYPES;
+}
+
+/**
+ * enc_id_cg_uncharge_hierarchy() - Uncharge the enryption ID cgroup hierarchy.
+ * @start_cg: Starting cgroup.
+ * @stop_cg: cgroup at which uncharge stops.
+ * @type: type of encryption ID to uncharge.
+ * @amount: Charge amount.
+ *
+ * Uncharge the cgroup tree from the given start cgroup to the stop cgroup.
+ *
+ * Context: Any context. Expects enc_id_cg_lock to be held by the caller.
+ */
+static void enc_id_cg_uncharge_hierarchy(struct encryption_id_cgroup *start_cg,
+					 struct encryption_id_cgroup *stop_cg,
+					 enum encryption_id_type type,
+					 unsigned int amount)
+{
+	struct encryption_id_cgroup *i;
+
+	lockdep_assert_held(&enc_id_cg_lock);
+
+	for (i = start_cg; i != stop_cg; i = parent_enc(i)) {
+		WARN_ON_ONCE(i->res[type].usage < amount);
+		i->res[type].usage -= amount;
+	}
+	css_put(&start_cg->css);
+}
+
+/**
+ * enc_id_cg_set_capacity() - Set the capacity of the encryption ID.
+ * @type: Type of the encryption ID.
+ * @capacity: Supported capacity of the encryption ID on the host.
+ *
+ * If capacity is 0 then the charging a cgroup fails for the encryption ID.
+ *
+ * Context: Any context. Takes and releases enc_id_cg_lock.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ * * %-EBUSY - If current usage is more than the capacity.
+ */
+int enc_id_cg_set_capacity(enum encryption_id_type type, unsigned int capacity)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	if (!valid_type(type))
+		return -EINVAL;
+
+	spin_lock_irqsave(&enc_id_cg_lock, flags);
+
+	if (WARN_ON_ONCE(root_cg.res[type].usage > capacity))
+		ret = -EBUSY;
+	else
+		enc_id_capacity[type] = capacity;
+
+	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(enc_id_cg_set_capacity);
+
+/**
+ * enc_id_cg_try_charge() - Try charging encryption ID cgroup.
+ * @kvm: kvm to store charged cgroup.
+ * @type: Encryption ID type to charge.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the cgroup to which the current task belongs to. Charged
+ * cgroup will be pointed by @cg. Caller must use the same cgroup during
+ * uncharge call.
+ *
+ * Context: Any context. Takes and releases enc_id_cg_lock.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid or encryption ID has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ *	      capacity.
+ */
+int enc_id_cg_try_charge(struct kvm *kvm, enum encryption_id_type type,
+			 unsigned int amount)
+{
+	struct encryption_id_cgroup *task_cg, *i;
+	struct encryption_id_res *id_res;
+	int ret;
+	unsigned int new_usage;
+	unsigned long flags;
+
+	if (!valid_type(type) || !kvm)
+		return -EINVAL;
+
+	if (!amount)
+		return 0;
+
+	spin_lock_irqsave(&enc_id_cg_lock, flags);
+
+	if (!enc_id_capacity[type]) {
+		ret = -EINVAL;
+		goto err_capacity;
+	}
+
+	task_cg = css_enc(task_get_css(current, encryption_ids_cgrp_id));
+
+	for (i = task_cg; i; i = parent_enc(i)) {
+		id_res = &i->res[type];
+
+		new_usage = id_res->usage + amount;
+		WARN_ON_ONCE(new_usage < id_res->usage);
+
+		if (new_usage > id_res->max ||
+		    new_usage > enc_id_capacity[type]) {
+			ret = -EBUSY;
+			goto err_charge;
+		}
+
+		id_res->usage = new_usage;
+	}
+
+	kvm->enc_id_cg = task_cg;
+	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
+	return 0;
+
+err_charge:
+	enc_id_cg_uncharge_hierarchy(task_cg, i, type, amount);
+err_capacity:
+	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(enc_id_cg_try_charge);
+
+/**
+ * enc_id_cg_uncharge() - Uncharge the encryption ID cgroup.
+ * @kvm: kvm containing the corresponding encryption ID cgroup.
+ * @type: Encryption ID which was charged.
+ * @amount: Charged amount.
+ *
+ * Context: Any context. Takes and releases enc_id_cg_lock.
+ */
+void enc_id_cg_uncharge(struct kvm *kvm, enum encryption_id_type type,
+			unsigned int amount)
+{
+	unsigned long flags;
+
+	if (!amount)
+		return;
+	if (!valid_type(type))
+		return;
+	if (!kvm || WARN_ON_ONCE(!(kvm->enc_id_cg)))
+		return;
+
+	spin_lock_irqsave(&enc_id_cg_lock, flags);
+	enc_id_cg_uncharge_hierarchy(kvm->enc_id_cg, NULL, type, amount);
+	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
+
+	kvm->enc_id_cg = NULL;
+}
+EXPORT_SYMBOL(enc_id_cg_uncharge);
+
+/**
+ * enc_id_cg_max_show() - Show encryption ID cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Uses cft->private value to determine for which enryption ID type results be
+ * shown.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int enc_id_cg_max_show(struct seq_file *sf, void *v)
+{
+	struct encryption_id_cgroup *cg = css_enc(seq_css(sf));
+	enum encryption_id_type type = seq_cft(sf)->private;
+
+	if (cg->res[type].max == MAX_NUM)
+		seq_printf(sf, "%s\n", MAX_STR);
+	else
+		seq_printf(sf, "%u\n", cg->res[type].max);
+
+	return 0;
+}
+
+/**
+ * enc_id_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ *	 integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * Uses cft->private value to determine for which enryption ID type results be
+ * shown.
+ *
+ * Context: Any context. Takes and releases enc_id_cg_lock.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid.
+ * * -ERANGE - If number is bigger than unsigned int capacity.
+ * * -EBUSY - If usage can become more than max limit.
+ */
+static ssize_t enc_id_cg_max_write(struct kernfs_open_file *of, char *buf,
+				   size_t nbytes, loff_t off)
+{
+	struct encryption_id_cgroup *cg;
+	unsigned int max;
+	int ret = 0;
+	enum encryption_id_type type;
+
+	buf = strstrip(buf);
+	if (!strcmp(MAX_STR, buf)) {
+		max = UINT_MAX;
+	} else {
+		ret = kstrtouint(buf, 0, &max);
+		if (ret)
+			return ret;
+	}
+
+	cg = css_enc(of_css(of));
+	type = of_cft(of)->private;
+	cg->res[type].max = max;
+
+	return nbytes;
+}
+
+/**
+ * enc_id_cg_current_read() - Show current usage of the encryption ID.
+ * @css: css pointer of the cgroup.
+ * @cft: cft pointer of the cgroup.
+ *
+ * Uses cft->private value to determine for which enryption ID type results be
+ * shown.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static u64 enc_id_cg_current_read(struct cgroup_subsys_state *css,
+				  struct cftype *cft)
+{
+	struct encryption_id_cgroup *cg = css_enc(css);
+	enum encryption_id_type type = cft->private;
+
+	return cg->res[type].usage;
+}
+
+/**
+ * enc_id_cg_stat_show() - Show the current stat of the cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Shows the total capacity of the encryption ID and its current usage.
+ * Only shows in root cgroup directory.
+ *
+ * Uses cft->private value to determine for which enryption ID type results be
+ * shown.
+ *
+ * Context: Any context. Takes and releases enc_id_cg_lock.
+ * Return: 0 to denote successful print.
+ */
+static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
+{
+	unsigned long flags;
+	enum encryption_id_type type = seq_cft(sf)->private;
+
+	spin_lock_irqsave(&enc_id_cg_lock, flags);
+
+	seq_printf(sf, "total %u\n", enc_id_capacity[type]);
+	seq_printf(sf, "used %u\n", root_cg.res[type].usage);
+
+	spin_unlock_irqrestore(&enc_id_cg_lock, flags);
+	return 0;
+}
+
+/* Each encryption ID type has these cgroup files. */
+#define ENC_ID_CGROUP_FILES(id_name, id_type)		\
+	[(id_type) * 3] = {				\
+		.name = id_name ".max",			\
+		.write = enc_id_cg_max_write,		\
+		.seq_show = enc_id_cg_max_show,		\
+		.flags = CFTYPE_NOT_ON_ROOT,		\
+		.private = id_type,			\
+	},						\
+	[((id_type) * 3) + 1] = {			\
+		.name = id_name ".current",		\
+		.read_u64 = enc_id_cg_current_read,	\
+		.flags = CFTYPE_NOT_ON_ROOT,		\
+		.private = id_type,			\
+	},						\
+	[((id_type) * 3) + 2] = {			\
+		.name = id_name ".stat",		\
+		.seq_show = enc_id_cg_stat_show,	\
+		.flags = CFTYPE_ONLY_ON_ROOT,		\
+		.private = id_type,			\
+	}
+
+/* Encryption ID cgroup interface files */
+static struct cftype enc_id_cg_files[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+	ENC_ID_CGROUP_FILES("sev", ENCRYPTION_ID_SEV),
+	ENC_ID_CGROUP_FILES("sev_es", ENCRYPTION_ID_SEV_ES),
+#endif
+	{}
+};
+
+/**
+ * enc_id_cg_alloc() - Allocate encryption ID cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+enc_id_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+	enum encryption_id_type i;
+	struct encryption_id_cgroup *cg;
+
+	if (!parent_css) {
+		cg = &root_cg;
+	} else {
+		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+		if (!cg)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < ENCRYPTION_ID_TYPES; i++)
+		cg->res[i].max = MAX_NUM;
+
+	return &cg->css;
+}
+
+/**
+ * enc_id_cg_free() - Free the encryption ID cgroup.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void enc_id_cg_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_enc(css));
+}
+
+/* Cgroup controller callbacks */
+struct cgroup_subsys encryption_ids_cgrp_subsys = {
+	.css_alloc = enc_id_cg_alloc,
+	.css_free = enc_id_cg_free,
+	.legacy_cftypes = enc_id_cg_files,
+	.dfl_cftypes = enc_id_cg_files,
+};