linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] x86/Hyper-V: Support for free page reporting
@ 2021-01-06 23:20 Sunil Muthuswamy
  2021-01-26 17:11 ` Wei Liu
  2021-02-04 23:35 ` Michael Kelley
  0 siblings, 2 replies; 5+ messages in thread
From: Sunil Muthuswamy @ 2021-01-06 23:20 UTC (permalink / raw)
  To: Matheus Castello, linux-hyperv, Haiyang Zhang, Stephen Hemminger,
	Wei Liu, Michael Kelley, Tianyu Lan, Wei Liu, vkuznets
  Cc: KY Srinivasan, linux-kernel

Linux has support for free page reporting now (36e66c554b5c) for
virtualized environment. On Hyper-V when virtually backed VMs are
configured, Hyper-V will advertise cold memory discard capability,
when supported. This patch adds the support to hook into the free
page reporting infrastructure and leverage the Hyper-V cold memory
discard hint hypercall to report/free these pages back to the host.

Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
Tested-by: Matheus Castello <matheus@castello.eng.br>
---
In V2:
- Addressed feedback comments
- Added page reporting config option tied to hyper-v balloon config

In V3:
- Addressed feedback from Vitaly
---
 arch/x86/hyperv/hv_init.c         | 31 +++++++++++
 arch/x86/kernel/cpu/mshyperv.c    |  6 +-
 drivers/hv/Kconfig                |  1 +
 drivers/hv/hv_balloon.c           | 93 +++++++++++++++++++++++++++++++
 include/asm-generic/hyperv-tlfs.h | 32 ++++++++++-
 include/asm-generic/mshyperv.h    |  2 +
 6 files changed, 162 insertions(+), 3 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index e04d90af4c27..5b610e47d091 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -528,3 +528,34 @@ bool hv_is_hibernation_supported(void)
 	return acpi_sleep_state_supported(ACPI_STATE_S4);
 }
 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
+
+/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
+bool hv_query_ext_cap(u64 cap_query)
+{
+	u64 *cap;
+	unsigned long flags;
+	u64 ext_cap = 0;
+
+	/*
+	 * Querying extended capabilities is an extended hypercall. Check if the
+	 * partition supports extended hypercall, first.
+	 */
+	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
+		return 0;
+
+	/*
+	 * Repurpose the input page arg to accept output from Hyper-V for
+	 * now because this is the only call that needs output from the
+	 * hypervisor. It should be fixed properly by introducing an
+	 * output arg once we have more places that require output.
+	 */
+	local_irq_save(flags);
+	cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);
+	if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
+	    HV_STATUS_SUCCESS)
+		ext_cap = *cap;
+
+	local_irq_restore(flags);
+	return ext_cap & cap_query;
+}
+EXPORT_SYMBOL_GPL(hv_query_ext_cap);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 05ef1f4550cb..f4c0d69c61ae 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -225,11 +225,13 @@ static void __init ms_hyperv_init_platform(void)
 	 * Extract the features and hints
 	 */
 	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
 	ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
 	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-	pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
-		ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+	pr_info("Hyper-V: privilege flags low:0x%x, high:0x%x, hints:0x%x, misc:0x%x\n",
+		ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+		ms_hyperv.misc_features);
 
 	ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
 	ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 79e5356a737a..66c794d92391 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -23,6 +23,7 @@ config HYPERV_UTILS
 config HYPERV_BALLOON
 	tristate "Microsoft Hyper-V Balloon driver"
 	depends on HYPERV
+	select PAGE_REPORTING
 	help
 	  Select this option to enable Hyper-V Balloon driver.
 
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 8c471823a5af..c0ff0a48f540 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -21,6 +21,7 @@
 #include <linux/memory.h>
 #include <linux/notifier.h>
 #include <linux/percpu_counter.h>
+#include <linux/page_reporting.h>
 
 #include <linux/hyperv.h>
 #include <asm/hyperv-tlfs.h>
@@ -563,6 +564,10 @@ struct hv_dynmem_device {
 	 * The negotiated version agreed by host.
 	 */
 	__u32 version;
+
+#ifdef CONFIG_PAGE_REPORTING
+	struct page_reporting_dev_info pr_dev_info;
+#endif
 };
 
 static struct hv_dynmem_device dm_device;
@@ -1568,6 +1573,84 @@ static void balloon_onchannelcallback(void *context)
 
 }
 
+#ifdef CONFIG_PAGE_REPORTING
+/* Hyper-V only supports reporting 2MB pages or higher */
+#define HV_MIN_PAGE_REPORTING_ORDER	9
+#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
+static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+		    struct scatterlist *sgl, unsigned int nents)
+{
+	unsigned long flags;
+	struct hv_memory_hint *hint;
+	int i;
+	u64 status;
+	struct scatterlist *sg;
+
+	WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+	local_irq_save(flags);
+	hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
+	if (!hint) {
+		local_irq_restore(flags);
+		return -ENOSPC;
+	}
+
+	hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
+	hint->reserved = 0;
+	for_each_sg(sgl, sg, nents, i) {
+		union hv_gpa_page_range *range;
+
+		range = &hint->ranges[i];
+		range->address_space = 0;
+		/* page reporting only reports 2MB pages or higher */
+		range->page.largepage = 1;
+		range->page.additional_pages =
+			(sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
+		range->base_large_pfn =
+			page_to_pfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
+	}
+
+	status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
+				     hint, NULL);
+	local_irq_restore(flags);
+	if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+		pr_err("Cold memory discard hypercall failed with status %llx\n",
+			status);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void enable_page_reporting(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(pageblock_order < HV_MIN_PAGE_REPORTING_ORDER);
+	if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
+		pr_debug("Cold memory discard hint not supported by Hyper-V\n");
+		return;
+	}
+
+	BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+	dm_device.pr_dev_info.report = hv_free_page_report;
+	ret = page_reporting_register(&dm_device.pr_dev_info);
+	if (ret < 0) {
+		dm_device.pr_dev_info.report = NULL;
+		pr_err("Failed to enable cold memory discard: %d\n", ret);
+	} else {
+		pr_info("Cold memory discard hint enabled\n");
+	}
+}
+
+static void disable_page_reporting(void)
+{
+	if (dm_device.pr_dev_info.report) {
+		page_reporting_unregister(&dm_device.pr_dev_info);
+		dm_device.pr_dev_info.report = NULL;
+	}
+}
+#endif /* CONFIG_PAGE_REPORTING */
+
 static int balloon_connect_vsp(struct hv_device *dev)
 {
 	struct dm_version_request version_req;
@@ -1713,6 +1796,10 @@ static int balloon_probe(struct hv_device *dev,
 	if (ret != 0)
 		return ret;
 
+#ifdef CONFIG_PAGE_REPORTING
+	enable_page_reporting();
+#endif
+
 	dm_device.state = DM_INITIALIZED;
 
 	dm_device.thread =
@@ -1731,6 +1818,9 @@ static int balloon_probe(struct hv_device *dev,
 #ifdef CONFIG_MEMORY_HOTPLUG
 	unregister_memory_notifier(&hv_memory_nb);
 	restore_online_page_callback(&hv_online_page);
+#endif
+#ifdef CONFIG_PAGE_REPORTING
+	disable_page_reporting();
 #endif
 	return ret;
 }
@@ -1753,6 +1843,9 @@ static int balloon_remove(struct hv_device *dev)
 #ifdef CONFIG_MEMORY_HOTPLUG
 	unregister_memory_notifier(&hv_memory_nb);
 	restore_online_page_callback(&hv_online_page);
+#endif
+#ifdef CONFIG_PAGE_REPORTING
+	disable_page_reporting();
 #endif
 	spin_lock_irqsave(&dm_device.ha_lock, flags);
 	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index e73a11850055..75c20be2cc44 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -89,6 +89,7 @@
 #define HV_ACCESS_STATS				BIT(8)
 #define HV_DEBUGGING				BIT(11)
 #define HV_CPU_POWER_MANAGEMENT			BIT(12)
+#define HV_ENABLE_EXTENDED_HYPERCALLS		BIT(20)
 
 
 /*
@@ -152,11 +153,18 @@ struct ms_hyperv_tsc_page {
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
 
+/* Extended hypercalls */
+#define HV_EXT_CALL_QUERY_CAPABILITIES		0x8001
+#define HV_EXT_CALL_MEMORY_HEAT_HINT		0x8003
+
 #define HV_FLUSH_ALL_PROCESSORS			BIT(0)
 #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES	BIT(1)
 #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY	BIT(2)
 #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT	BIT(3)
 
+/* Extended capability bits */
+#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
+
 enum HV_GENERIC_SET_FORMAT {
 	HV_GENERIC_SET_SPARSE_4K,
 	HV_GENERIC_SET_ALL,
@@ -367,7 +375,7 @@ struct hv_guest_mapping_flush {
  */
 #define HV_MAX_FLUSH_PAGES (2048)
 
-/* HvFlushGuestPhysicalAddressList hypercall */
+/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */
 union hv_gpa_page_range {
 	u64 address_space;
 	struct {
@@ -375,6 +383,12 @@ union hv_gpa_page_range {
 		u64 largepage:1;
 		u64 basepfn:52;
 	} page;
+	struct {
+		u64 reserved:12;
+		u64 page_size:1;
+		u64 reserved1:8;
+		u64 base_large_pfn:43;
+	};
 };
 
 /*
@@ -494,4 +508,20 @@ struct hv_set_vp_registers_input {
 	} element[];
 } __packed;
 
+/*
+ * The whole argument should fit in a page to be able to pass to the hypervisor
+ * in one hypercall.
+ */
+#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES  \
+	((PAGE_SIZE - sizeof(struct hv_memory_hint)) / \
+		sizeof(union hv_gpa_page_range))
+
+/* HvExtCallMemoryHeatHint hypercall */
+#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD	2
+struct hv_memory_hint {
+	u64 type:2;
+	u64 reserved:62;
+	union hv_gpa_page_range ranges[];
+} __packed;
+
 #endif
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index c57799684170..93c1303f5e00 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -27,6 +27,7 @@
 
 struct ms_hyperv_info {
 	u32 features;
+	u32 priv_high;
 	u32 misc_features;
 	u32 hints;
 	u32 nested_features;
@@ -170,6 +171,7 @@ void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
 bool hv_is_hyperv_initialized(void);
 bool hv_is_hibernation_supported(void);
 void hyperv_cleanup(void);
+bool hv_query_ext_cap(u64 cap_query);
 #else /* CONFIG_HYPERV */
 static inline bool hv_is_hyperv_initialized(void) { return false; }
 static inline bool hv_is_hibernation_supported(void) { return false; }
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] x86/Hyper-V: Support for free page reporting
  2021-01-06 23:20 [PATCH v3] x86/Hyper-V: Support for free page reporting Sunil Muthuswamy
@ 2021-01-26 17:11 ` Wei Liu
  2021-02-04 23:35 ` Michael Kelley
  1 sibling, 0 replies; 5+ messages in thread
From: Wei Liu @ 2021-01-26 17:11 UTC (permalink / raw)
  To: Sunil Muthuswamy
  Cc: Matheus Castello, linux-hyperv, Haiyang Zhang, Stephen Hemminger,
	Wei Liu, Michael Kelley, Tianyu Lan, Wei Liu, vkuznets,
	KY Srinivasan, linux-kernel

On Wed, Jan 06, 2021 at 11:20:33PM +0000, Sunil Muthuswamy wrote:
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
> Tested-by: Matheus Castello <matheus@castello.eng.br>

Reviewed-by: Wei Liu <wei.liu@kernel.org>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3] x86/Hyper-V: Support for free page reporting
  2021-01-06 23:20 [PATCH v3] x86/Hyper-V: Support for free page reporting Sunil Muthuswamy
  2021-01-26 17:11 ` Wei Liu
@ 2021-02-04 23:35 ` Michael Kelley
  2021-02-05 14:56   ` Michael Kelley
  2021-03-17 20:30   ` Sunil Muthuswamy
  1 sibling, 2 replies; 5+ messages in thread
From: Michael Kelley @ 2021-02-04 23:35 UTC (permalink / raw)
  To: Sunil Muthuswamy, Matheus Castello, linux-hyperv, Haiyang Zhang,
	Stephen Hemminger, Wei Liu, Tianyu Lan, Wei Liu, vkuznets
  Cc: KY Srinivasan, linux-kernel

From: Sunil Muthuswamy <sunilmut@microsoft.com> Sent: Wednesday, January 6, 2021 3:21 PM
> 
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
> 
> Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
> Tested-by: Matheus Castello <matheus@castello.eng.br>
> ---
> In V2:
> - Addressed feedback comments
> - Added page reporting config option tied to hyper-v balloon config
> 
> In V3:
> - Addressed feedback from Vitaly
> ---
>  arch/x86/hyperv/hv_init.c         | 31 +++++++++++
>  arch/x86/kernel/cpu/mshyperv.c    |  6 +-
>  drivers/hv/Kconfig                |  1 +
>  drivers/hv/hv_balloon.c           | 93 +++++++++++++++++++++++++++++++
>  include/asm-generic/hyperv-tlfs.h | 32 ++++++++++-
>  include/asm-generic/mshyperv.h    |  2 +
>  6 files changed, 162 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index e04d90af4c27..5b610e47d091 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -528,3 +528,34 @@ bool hv_is_hibernation_supported(void)
>  	return acpi_sleep_state_supported(ACPI_STATE_S4);
>  }
>  EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
> +
> +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
> +bool hv_query_ext_cap(u64 cap_query)
> +{
> +	u64 *cap;
> +	unsigned long flags;
> +	u64 ext_cap = 0;
> +
> +	/*
> +	 * Querying extended capabilities is an extended hypercall. Check if the
> +	 * partition supports extended hypercall, first.
> +	 */
> +	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
> +		return 0;

Return 'false' since the function is declared as bool?

> +
> +	/*
> +	 * Repurpose the input page arg to accept output from Hyper-V for
> +	 * now because this is the only call that needs output from the
> +	 * hypervisor. It should be fixed properly by introducing an
> +	 * output arg once we have more places that require output.
> +	 */
> +	local_irq_save(flags);
> +	cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);
> +	if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> +	    HV_STATUS_SUCCESS)

Need to mask before checking for HV_STATUS_SUCCESS.  With regard to the
reserved fields in the returned 64 bit status, the TLFS says "Callers should ignore the
value in these bits".  There's no promise that they are zero.

> +		ext_cap = *cap;
> +
> +	local_irq_restore(flags);
> +	return ext_cap & cap_query;
> +}

As I noted in a review comment back in May, the output arg here is
only 64 bits in size and could just live on the stack with assurance that
it won't cross a page boundary.  So the code could be:

/*
 * Report whether Hyper-V advertises the extended capability bit(s) given in
 * cap_query (see HV_EXT_CAPABILITY_xxx).
 *
 * Returns false when the partition lacks the extended-hypercall privilege or
 * when the query hypercall does not complete successfully.
 */
bool hv_query_ext_cap(u64 cap_query)
{
	/*
	 * Receives the capability mask written by the hypervisor.
	 * NOTE(review): hv_do_hypercall() has to translate this address to a
	 * physical one; confirm that a stack variable is acceptable for that
	 * (e.g. when kernel stacks are virtually mapped) before adopting this
	 * form over the per-cpu hypercall argument page.
	 */
	u64	cap = 0;
	u64	status;

	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
		return false;

	status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, &cap);
	/* Compare only the result field; the reserved status bits must be ignored. */
	if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
		cap = 0;

	return cap_query & cap;
}

But if you think there's value in using the designated page for hypercall args,
I'm OK with just fixing the testing of the status.

> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 05ef1f4550cb..f4c0d69c61ae 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -225,11 +225,13 @@ static void __init ms_hyperv_init_platform(void)
>  	 * Extract the features and hints
>  	 */
>  	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> +	ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
>  	ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
>  	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
> 
> -	pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> -		ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> +	pr_info("Hyper-V: privilege flags low:0x%x, high:0x%x, hints:0x%x, misc:0x%x\n",

Nit.  Could we just use a space instead of a colon before each of the printed hex values?

> +		ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
> +		ms_hyperv.misc_features);
> 
>  	ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
>  	ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 79e5356a737a..66c794d92391 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -23,6 +23,7 @@ config HYPERV_UTILS
>  config HYPERV_BALLOON
>  	tristate "Microsoft Hyper-V Balloon driver"
>  	depends on HYPERV
> +	select PAGE_REPORTING

With this selection made, are the #ifdef CONFIG_PAGE_REPORTING occurrences
below really needed?  I looked at the virtio balloon driver, which also does
"select PAGE_REPORTING", and it does not have any #ifdef's.

>  	help
>  	  Select this option to enable Hyper-V Balloon driver.
> 
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 8c471823a5af..c0ff0a48f540 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -21,6 +21,7 @@
>  #include <linux/memory.h>
>  #include <linux/notifier.h>
>  #include <linux/percpu_counter.h>
> +#include <linux/page_reporting.h>
> 
>  #include <linux/hyperv.h>
>  #include <asm/hyperv-tlfs.h>
> @@ -563,6 +564,10 @@ struct hv_dynmem_device {
>  	 * The negotiated version agreed by host.
>  	 */
>  	__u32 version;
> +
> +#ifdef CONFIG_PAGE_REPORTING
> +	struct page_reporting_dev_info pr_dev_info;
> +#endif
>  };
> 
>  static struct hv_dynmem_device dm_device;
> @@ -1568,6 +1573,84 @@ static void balloon_onchannelcallback(void *context)
> 
>  }
> 
> +#ifdef CONFIG_PAGE_REPORTING
> +/* Hyper-V only supports reporting 2MB pages or higher */

I'm guessing the above is the same on ARM64 where the guest is using 16K
or 64K page size, because Hyper-V always uses 4K pages and expects all guest
communication to be in units of 4K pages.

> +#define HV_MIN_PAGE_REPORTING_ORDER	9
> +#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
> +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> +		    struct scatterlist *sgl, unsigned int nents)
> +{
> +	unsigned long flags;
> +	struct hv_memory_hint *hint;
> +	int i;
> +	u64 status;
> +	struct scatterlist *sg;
> +
> +	WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> +	local_irq_save(flags);
> +	hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
> +	if (!hint) {
> +		local_irq_restore(flags);
> +		return -ENOSPC;
> +	}
> +
> +	hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> +	hint->reserved = 0;
> +	for_each_sg(sgl, sg, nents, i) {
> +		union hv_gpa_page_range *range;
> +
> +		range = &hint->ranges[i];
> +		range->address_space = 0;
> +		/* page reportting only reports 2MB pages or higher */
> +		range->page.largepage = 1;
> +		range->page.additional_pages =
> +			(sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;

Perhaps verify that sg->length is at least 2 Meg? (similar to verifying that nents
isn't too big).  If it isn't at least 2 Meg, then additional_pages will get set to -1,
and I suspect weird things will happen.

I was also thinking about whether sg->length could be big enough to overflow
the additional_pages field.  sg->length is an unsigned int, so I don't think so.

> +		range->base_large_pfn =
> +			page_to_pfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;

page_to_pfn() will do the wrong thing on ARM64 with 16K or 64K pages.
Use page_to_hvpfn() instead.

> +	}
> +
> +	status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
> +				     hint, NULL);
> +	local_irq_restore(flags);
> +	if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
> +		pr_err("Cold memory discard hypercall failed with status %llx\n",
> +			status);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static void enable_page_reporting(void)
> +{
> +	int ret;
> +
> +	BUILD_BUG_ON(pageblock_order < HV_MIN_PAGE_REPORTING_ORDER);

The BUILD_BUG_ON won't work in the case where pageblock_order is
actually a variable rather than a constant, though that's currently only ia64 and
powerpc, which we don't directly care about.  Nonetheless, this would break if
pageblock_order were to become a variable.

> +	if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
> +		pr_debug("Cold memory discard hint not supported by Hyper-V\n");
> +		return;
> +	}
> +
> +	BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> +	dm_device.pr_dev_info.report = hv_free_page_report;
> +	ret = page_reporting_register(&dm_device.pr_dev_info);
> +	if (ret < 0) {
> +		dm_device.pr_dev_info.report = NULL;
> +		pr_err("Failed to enable cold memory discard: %d\n", ret);
> +	} else {
> +		pr_info("Cold memory discard hint enabled\n");
> +	}

Should the above two messages be prefixed with "Hyper-V: "?

> +}
> +
> +static void disable_page_reporting(void)
> +{
> +	if (dm_device.pr_dev_info.report) {
> +		page_reporting_unregister(&dm_device.pr_dev_info);
> +		dm_device.pr_dev_info.report = NULL;
> +	}
> +}
> +#endif /* CONFIG_PAGE_REPORTING */
> +
>  static int balloon_connect_vsp(struct hv_device *dev)
>  {
>  	struct dm_version_request version_req;
> @@ -1713,6 +1796,10 @@ static int balloon_probe(struct hv_device *dev,
>  	if (ret != 0)
>  		return ret;
> 
> +#ifdef CONFIG_PAGE_REPORTING
> +	enable_page_reporting();
> +#endif
> +
>  	dm_device.state = DM_INITIALIZED;
> 
>  	dm_device.thread =
> @@ -1731,6 +1818,9 @@ static int balloon_probe(struct hv_device *dev,
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  	unregister_memory_notifier(&hv_memory_nb);
>  	restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> +	disable_page_reporting();
>  #endif

Nit:  Typically the error path undoes things in the reverse order. So
the disable_page_reporting() would occur before the call to
vmbus_close().

>  	return ret;
>  }
> @@ -1753,6 +1843,9 @@ static int balloon_remove(struct hv_device *dev)
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  	unregister_memory_notifier(&hv_memory_nb);
>  	restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> +	disable_page_reporting();
>  #endif

Same here regarding the ordering.

>  	spin_lock_irqsave(&dm_device.ha_lock, flags);
>  	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
> index e73a11850055..75c20be2cc44 100644
> --- a/include/asm-generic/hyperv-tlfs.h
> +++ b/include/asm-generic/hyperv-tlfs.h
> @@ -89,6 +89,7 @@
>  #define HV_ACCESS_STATS				BIT(8)
>  #define HV_DEBUGGING				BIT(11)
>  #define HV_CPU_POWER_MANAGEMENT			BIT(12)
> +#define HV_ENABLE_EXTENDED_HYPERCALLS		BIT(20)
> 
> 
>  /*
> @@ -152,11 +153,18 @@ struct ms_hyperv_tsc_page {
>  #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
>  #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
> 
> +/* Extended hypercalls */
> +#define HV_EXT_CALL_QUERY_CAPABILITIES		0x8001
> +#define HV_EXT_CALL_MEMORY_HEAT_HINT		0x8003
> +
>  #define HV_FLUSH_ALL_PROCESSORS			BIT(0)
>  #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES	BIT(1)
>  #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY	BIT(2)
>  #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT	BIT(3)
> 
> +/* Extended capability bits */
> +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
> +
>  enum HV_GENERIC_SET_FORMAT {
>  	HV_GENERIC_SET_SPARSE_4K,
>  	HV_GENERIC_SET_ALL,
> @@ -367,7 +375,7 @@ struct hv_guest_mapping_flush {
>   */
>  #define HV_MAX_FLUSH_PAGES (2048)
> 
> -/* HvFlushGuestPhysicalAddressList hypercall */
> +/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */
>  union hv_gpa_page_range {
>  	u64 address_space;
>  	struct {
> @@ -375,6 +383,12 @@ union hv_gpa_page_range {
>  		u64 largepage:1;
>  		u64 basepfn:52;
>  	} page;
> +	struct {
> +		u64 reserved:12;
> +		u64 page_size:1;
> +		u64 reserved1:8;
> +		u64 base_large_pfn:43;
> +	};
>  };
> 
>  /*
> @@ -494,4 +508,20 @@ struct hv_set_vp_registers_input {
>  	} element[];
>  } __packed;
> 
> +/*
> + * The whole argument should fit in a page to be able to pass to the hypervisor
> + * in one hypercall.
> + */
> +#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES  \
> +	((PAGE_SIZE - sizeof(struct hv_memory_hint)) / \

Use HV_HYP_PAGE_SIZE instead of PAGE_SIZE.

> +		sizeof(union hv_gpa_page_range))
> +
> +/* HvExtCallMemoryHeatHint hypercall */
> +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD	2
> +struct hv_memory_hint {
> +	u64 type:2;
> +	u64 reserved:62;
> +	union hv_gpa_page_range ranges[];
> +} __packed;
> +
>  #endif
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index c57799684170..93c1303f5e00 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -27,6 +27,7 @@
> 
>  struct ms_hyperv_info {
>  	u32 features;
> +	u32 priv_high;
>  	u32 misc_features;
>  	u32 hints;
>  	u32 nested_features;
> @@ -170,6 +171,7 @@ void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
>  bool hv_is_hyperv_initialized(void);
>  bool hv_is_hibernation_supported(void);
>  void hyperv_cleanup(void);
> +bool hv_query_ext_cap(u64 cap_query);
>  #else /* CONFIG_HYPERV */
>  static inline bool hv_is_hyperv_initialized(void) { return false; }
>  static inline bool hv_is_hibernation_supported(void) { return false; }
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3] x86/Hyper-V: Support for free page reporting
  2021-02-04 23:35 ` Michael Kelley
@ 2021-02-05 14:56   ` Michael Kelley
  2021-03-17 20:30   ` Sunil Muthuswamy
  1 sibling, 0 replies; 5+ messages in thread
From: Michael Kelley @ 2021-02-05 14:56 UTC (permalink / raw)
  To: Sunil Muthuswamy, Matheus Castello, linux-hyperv, Haiyang Zhang,
	Stephen Hemminger, Wei Liu, Tianyu Lan, Wei Liu, vkuznets
  Cc: KY Srinivasan, linux-kernel

From: Michael Kelley <mikelley@microsoft.com> Sent: Thursday, February 4, 2021 3:36 PM
> 
> From: Sunil Muthuswamy <sunilmut@microsoft.com> Sent: Wednesday, January 6, 2021
> 3:21 PM
> >
> > Linux has support for free page reporting now (36e66c554b5c) for
> > virtualized environment. On Hyper-V when virtually backed VMs are
> > configured, Hyper-V will advertise cold memory discard capability,
> > when supported. This patch adds the support to hook into the free
> > page reporting infrastructure and leverage the Hyper-V cold memory
> > discard hint hypercall to report/free these pages back to the host.
> >
> > Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
> > Tested-by: Matheus Castello <matheus@castello.eng.br>
> > ---
> > In V2:
> > - Addressed feedback comments
> > - Added page reporting config option tied to hyper-v balloon config
> >
> > In V3:
> > - Addressed feedback from Vitaly
> > ---
> >  arch/x86/hyperv/hv_init.c         | 31 +++++++++++
> >  arch/x86/kernel/cpu/mshyperv.c    |  6 +-
> >  drivers/hv/Kconfig                |  1 +
> >  drivers/hv/hv_balloon.c           | 93 +++++++++++++++++++++++++++++++
> >  include/asm-generic/hyperv-tlfs.h | 32 ++++++++++-
> >  include/asm-generic/mshyperv.h    |  2 +
> >  6 files changed, 162 insertions(+), 3 deletions(-)
> >

[snip]

> > +
> > +	BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
> > +	dm_device.pr_dev_info.report = hv_free_page_report;
> > +	ret = page_reporting_register(&dm_device.pr_dev_info);
> > +	if (ret < 0) {
> > +		dm_device.pr_dev_info.report = NULL;
> > +		pr_err("Failed to enable cold memory discard: %d\n", ret);
> > +	} else {
> > +		pr_info("Cold memory discard hint enabled\n");
> > +	}
> 
> Should the above two messages be prefixed with "Hyper-V: "?

Ignore the above comment.  The lines will get prefixed with
"hv_balloon:", which is fine.

Michael

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3] x86/Hyper-V: Support for free page reporting
  2021-02-04 23:35 ` Michael Kelley
  2021-02-05 14:56   ` Michael Kelley
@ 2021-03-17 20:30   ` Sunil Muthuswamy
  1 sibling, 0 replies; 5+ messages in thread
From: Sunil Muthuswamy @ 2021-03-17 20:30 UTC (permalink / raw)
  To: Michael Kelley, Matheus Castello, linux-hyperv, Haiyang Zhang,
	Stephen Hemminger, Wei Liu, Tianyu Lan, Wei Liu, vkuznets
  Cc: KY Srinivasan, linux-kernel

> > +	if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
> > +		return 0;
> 
> Return 'false' since the function is declared as bool?
Will fix this in the next iteration.

> > +	if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> > +	    HV_STATUS_SUCCESS)
> 
> Need to mask before checking for HV_STATUS_SUCCESS.  With regard to the
> reserved fields in the returned 64 bit status, the TLFS says "Callers should ignore the
> value in these bits".  There's no promise that they are zero.
Coming in next version.

> 
> > +		ext_cap = *cap;
> > +
> > +	local_irq_restore(flags);
> > +	return ext_cap & cap_query;
> > +}
> 
> As I noted in a review comment back in May, the output arg here is
> only 64 bits in size and could just live on the stack with assurance that
> it won't cross a page boundary.  So the code could be:
> 
> bool hv_query_ext_cap(u64 cap_query)
> {
> 	u64	cap;
> 	u64	status;
> 
> 	if(!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
> 		return false;
> 
> 	status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, &cap);
> 	if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
> 		cap = 0;
> 
> 	return extcap & cap;
> }
> 
> But if you think there's value in using the designated page for hypercall args,
> I'm OK with just fixing the testing of the status.

Hypercall input/output addresses should be 'virt_to_phys' compatible as 'hv_do_hypercall'
will call that on the address to get the physical address, to pass on to the hypervisor. Stack
variables can be virtually allocated and are not compatible with 'virt_to_phys', but we should
be able to use a 'static' variable for this. Will address this in the next version.

> 
> > -	pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> > -		ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> > +	pr_info("Hyper-V: privilege flags low:0x%x, high:0x%x, hints:0x%x, misc:0x%x\n",
> 
> Nit.  Could we just use a space instead of a colon before each of the printed hex values?
Sure, coming in next version.

> > @@ -23,6 +23,7 @@ config HYPERV_UTILS
> >  config HYPERV_BALLOON
> >  	tristate "Microsoft Hyper-V Balloon driver"
> >  	depends on HYPERV
> > +	select PAGE_REPORTING
> 
> With this selection made, are the #ifdef CONFIG_PAGE_REPORTING occurrences
> below really needed?  I looked at the virtio balloon driver, which is also does
> "select PAGE_REPORTING", and it does not have any #ifdef's.

Good point. Don't think we need extra 'ifdefs' for page reporting now that it is
implicit with Hyper-V Balloon. Coming in next version.

> >  static struct hv_dynmem_device dm_device;
> > @@ -1568,6 +1573,84 @@ static void balloon_onchannelcallback(void *context)
> >
> >  }
> >
> > +#ifdef CONFIG_PAGE_REPORTING
> > +/* Hyper-V only supports reporting 2MB pages or higher */
> 
> I'm guessing the above is the same on ARM64 where the guest is using 16K
> or 64K page size, because Hyper-V always uses 4K pages and expects all guest
> communication to be in units of 4K pages.

Yes.
 
> > +		range->page.additional_pages =
> > +			(sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
> 
> Perhaps verify that sg->length is at least 2 Meg? (similar to verifying that nents
> isn't too big).  If it isn't at least 2 Meg, then additional_pages will get set to -1,
> and I suspect weird things will happen.
I will add an assert.

> 
> I was also thinking about whether sg->length could be big enough to overflow
> the additional_pages field.  sg->length is an unsigned int, so I don't think so.
Yes, the additional_pages is designed to accommodate 32-bits.

> 
> > +		range->base_large_pfn =
> > +			page_to_pfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
> 
> page_to_pfn() will do the wrong thing on ARM64 with 16K or 64K pages.
> Use page_to_hvpfn() instead.
Good point.

> > +static void enable_page_reporting(void)
> > +{
> > +	int ret;
> > +
> > +	BUILD_BUG_ON(pageblock_order < HV_MIN_PAGE_REPORTING_ORDER);
> 
> The BUILD_BUG_ON won't work in the case where pageblock_order is
> actually a variable rather than a constant, though that's currently only ia64 and
> powerpc, which we don't directly care about.  Nonetheless, this would break if
> pageblock_order were to become a variable.
> 
I have moved this to a conditional statement. The compiler can optimize the code
away when it is a constant.

> > +	if (ret < 0) {
> > +		dm_device.pr_dev_info.report = NULL;
> > +		pr_err("Failed to enable cold memory discard: %d\n", ret);
> > +	} else {
> > +		pr_info("Cold memory discard hint enabled\n");
> > +	}
> 
> Should the above two messages be prefixed with "Hyper-V: "?
Not needed, as you also replied later.

> Nit:  Typically the error path undoes things in the reverse order. So
> the disable_page_reporting() would occur before the call to
> vmbus_close().
Sure.

> 
> >  	return ret;
> >  }
> > @@ -1753,6 +1843,9 @@ static int balloon_remove(struct hv_device *dev)
> >  #ifdef CONFIG_MEMORY_HOTPLUG
> >  	unregister_memory_notifier(&hv_memory_nb);
> >  	restore_online_page_callback(&hv_online_page);
> > +#endif
> > +#ifdef CONFIG_PAGE_REPORTING
> > +	disable_page_reporting();
> >  #endif
> 
> Same here regarding the ordering.
Noted.

> > + * The whole argument should fit in a page to be able to pass to the hypervisor
> > + * in one hypercall.
> > + */
> > +#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES  \
> > +	((PAGE_SIZE - sizeof(struct hv_memory_hint)) / \
> 
> Use HV_HYP_PAGE_SIZE instead of PAGE_SIZE.
Done.

Thanks for the review.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-03-17 20:31 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-06 23:20 [PATCH v3] x86/Hyper-V: Support for free page reporting Sunil Muthuswamy
2021-01-26 17:11 ` Wei Liu
2021-02-04 23:35 ` Michael Kelley
2021-02-05 14:56   ` Michael Kelley
2021-03-17 20:30   ` Sunil Muthuswamy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).