kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/4] KVM: MMU: mmu audit code improved
@ 2010-08-28 11:58 Xiao Guangrong
  2010-08-28 12:00 ` [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamicly Xiao Guangrong
                   ` (4 more replies)
  0 siblings, 5 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-28 11:58 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

The audit can help us to detect mmu bugs as early as possible; it can also
help us to fix them.

It's very useful, but it is impossible for Linux distributions to use it since:

- it needs to be enabled by defining the "AUDIT" macro at compile time
- the audit code has very high overhead; it makes the guest mostly hang

So, this patchset supports enabling/disabling it dynamically; the overhead is
very low when it is disabled, and it lowers the audit frequency to keep the
guest running.

After this patchset, we can enable it by:
mount -t debugfs none debugfs
echo 1 > debugfs/kvm/mmu-debug

disable it by:
echo 0 > debugfs/kvm/mmu-debug

By default, the audit is disabled.

[PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically
[PATCH 2/4] KVM: MMU: improve active sp audit
[PATCH 3/4] KVM: MMU: improve spte audit
[PATCH 4/4] KVM: MMU: lower the aduit frequency

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamicly
  2010-08-28 11:58 [PATCH 0/4] KVM: MMU: mmu audit code improved Xiao Guangrong
@ 2010-08-28 12:00 ` Xiao Guangrong
  2010-08-29  9:16   ` Avi Kivity
  2010-08-28 12:01 ` [PATCH 2/4] KVM: MMU: improve active sp audit Xiao Guangrong
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-28 12:00 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Add the debugfs file named 'mmu-debug', we can disable/enable mmu audit by
this file:

enable:
echo 1 > debugfs/kvm/mmu-debug

disable:
echo 0 > debugfs/kvm/mmu-debug

This patch does not change the logic.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/Kconfig       |    6 +
 arch/x86/kvm/mmu.c         |  250 ++--------------------------------
 arch/x86/kvm/mmu_debug.c   |  329 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu_debug.h   |   12 ++
 arch/x86/kvm/mmutrace.h    |   19 +++
 arch/x86/kvm/paging_tmpl.h |    4 +-
 virt/kvm/kvm_main.c        |    6 +-
 7 files changed, 380 insertions(+), 246 deletions(-)
 create mode 100644 arch/x86/kvm/mmu_debug.c
 create mode 100644 arch/x86/kvm/mmu_debug.h

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..67a941d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,12 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_DEBUG
+	bool "Debug KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This feature allows debug KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8609249 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -19,6 +19,7 @@
  */
 
 #include "mmu.h"
+#include "mmu_debug.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
 
@@ -51,14 +52,6 @@ bool tdp_enabled = false;
 
 #undef MMU_DEBUG
 
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
 #ifdef MMU_DEBUG
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -71,7 +64,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
 
 #endif
 
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#if defined MMU_DEBUG
 static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
@@ -2964,7 +2957,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
-	kvm_mmu_audit(vcpu, "pre pte write");
+	trace_kvm_mmu_audit(vcpu, "pre pte write");
 	if (guest_initiated) {
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
@@ -3037,7 +3030,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	kvm_mmu_audit(vcpu, "post pte write");
+	trace_kvm_mmu_audit(vcpu, "post pte write");
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
 		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3289,6 +3282,7 @@ void kvm_mmu_module_exit(void)
 	mmu_destroy_caches();
 	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 	unregister_shrinker(&mmu_shrinker);
+	mmu_debug_cleanup();
 }
 
 int kvm_mmu_module_init(void)
@@ -3315,6 +3309,8 @@ int kvm_mmu_module_init(void)
 
 	register_shrinker(&mmu_shrinker);
 
+	mmu_debug_init();
+
 	return 0;
 
 nomem:
@@ -3483,234 +3479,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
-		}
-	}
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
-	int i;
-	struct kvm_mmu_page *sp;
-
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
-		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
-		return;
-	}
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root && VALID_PAGE(root)) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
-		}
-	}
-	return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
-{
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
-
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
-			return;
-		}
-
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
-			return;
-
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
-
-		hpa =  pfn << PAGE_SHIFT;
-
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
-	}
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
-	unsigned long *rmapp;
-	struct kvm_mmu_page *rev_sp;
-	gfn_t gfn;
-
-
-	rev_sp = page_header(__pa(sptep));
-	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
-	if (!gfn_to_memslot(kvm, gfn)) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-				 audit_msg, gfn);
-		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-		       audit_msg, (long int)(sptep - rev_sp->spt),
-				rev_sp->gfn);
-		dump_stack();
-		return;
-	}
-
-	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-	if (!*rmapp) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-				 audit_msg, *sptep);
-		dump_stack();
-	}
-}
-
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	int i;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
-
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
-
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
-	}
-	return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	struct kvm_memory_slot *slot;
-	unsigned long *rmapp;
-	u64 *spte;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %llx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
-	}
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
-	int olddbg = dbg;
-
-	dbg = 0;
-	audit_msg = msg;
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
-	dbg = olddbg;
-}
-
+#ifdef CONFIG_KVM_MMU_DEBUG
+#include "mmu_debug.c"
 #endif
diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
new file mode 100644
index 0000000..d2c0048
--- /dev/null
+++ b/arch/x86/kvm/mmu_debug.c
@@ -0,0 +1,329 @@
+/*
+ * mmu_debug.c:
+ *
+ * Debug code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/debugfs.h>
+
+static struct dentry *debugfs_file;
+static bool mmu_debug;
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			if (!is_last_spte(ent, sp->role.level)) {
+				struct kvm_mmu_page *child;
+				child = page_header(ent & PT64_BASE_ADDR_MASK);
+				__mmu_spte_walk(kvm, child, fn);
+			} else
+				fn(kvm, &sp->spt[i]);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu->kvm, sp, fn);
+		}
+	}
+	return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
+
+		sp = page_header(__pa(sptep));
+
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
+
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
+			}
+		}
+
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
+		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
+	}
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned i;
+
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+	else
+		for (i = 0; i < 4; ++i)
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->arch.mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
+	}
+
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
+
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (!is_rmap_spte(pt[i]))
+				continue;
+
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+		}
+	}
+	return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	u64 *spte;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.direct)
+			continue;
+		if (sp->unsync)
+			continue;
+		if (sp->role.invalid)
+			continue;
+
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+		spte = rmap_next(vcpu->kvm, rmapp, NULL);
+		while (spte) {
+			if (is_writable_pte(*spte))
+				printk(KERN_ERR "%s: (%s) shadow page has "
+				"writable mappings: gfn %llx role %x\n",
+			       __func__, audit_msg, sp->gfn,
+			       sp->role.word);
+			spte = rmap_next(vcpu->kvm, rmapp, spte);
+		}
+	}
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
+{
+	audit_msg = msg;
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	if (strcmp("pre pte write", audit_msg) != 0)
+		audit_mappings(vcpu);
+	audit_sptes_have_rmaps(vcpu);
+}
+
+static void mmu_debug_enable(void)
+{
+	int ret;
+
+	if (mmu_debug)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_debug = true;
+}
+
+static void mmu_debug_disable(void)
+{
+	if (!mmu_debug)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_debug = false;
+}
+
+static ssize_t mmu_debug_write(struct file *filp, const char __user *ubuf,
+			       size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	unsigned long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+		mmu_debug_disable();
+		break;
+	case 1:
+		mmu_debug_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return cnt;
+}
+
+static ssize_t mmu_debug_read(struct file *filp, char __user *ubuf, size_t cnt,
+			      loff_t *ppos)
+{
+		char buf[64];
+		int r;
+
+		r = sprintf(buf, "%d\n", mmu_debug);
+		return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations mmu_debug_ops = {
+	.write	= mmu_debug_write,
+	.read	= mmu_debug_read,
+};
+
+void mmu_debug_init(void)
+{
+	debugfs_file = debugfs_create_file("mmu-debug", 0644, kvm_debugfs_dir,
+					   NULL, &mmu_debug_ops);
+}
+
+void mmu_debug_cleanup(void)
+{
+	debugfs_remove(debugfs_file);
+}
diff --git a/arch/x86/kvm/mmu_debug.h b/arch/x86/kvm/mmu_debug.h
new file mode 100644
index 0000000..23f634f
--- /dev/null
+++ b/arch/x86/kvm/mmu_debug.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_MMU_DEBUG_H
+#define _LINUX_MMU_DEBUG_H
+
+#ifdef CONFIG_KVM_MMU_DEBUG
+void mmu_debug_init(void);
+void mmu_debug_cleanup(void);
+#else
+static inline void mmu_debug_init(void) {};
+static inline void mmu_debug_cleanup(void) {};
+#endif
+
+#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..28a0e1f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 	TP_ARGS(sp)
 );
+
+TRACE_EVENT(
+	kvm_mmu_audit,
+	TP_PROTO(struct kvm_vcpu *vcpu, const char *msg),
+	TP_ARGS(vcpu, msg),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(const char *, msg)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->msg = msg;
+	),
+
+	TP_printk("%s", __entry->msg)
+);
+
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a0f2feb..d6f348b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 
-	kvm_mmu_audit(vcpu, "pre page fault");
+	trace_kvm_mmu_audit(vcpu, "pre page fault");
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
@@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	trace_kvm_mmu_audit(vcpu, "post page fault (fixed)");
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9a73b98..cc7b624 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2262,6 +2262,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	int r;
 	int cpu;
 
+	kvm_init_debug();
+
 	r = kvm_arch_init(opaque);
 	if (r)
 		goto out_fail;
@@ -2346,8 +2348,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
 
-	kvm_init_debug();
-
 	return 0;
 
 out_free:
@@ -2379,7 +2379,6 @@ EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
-	kvm_exit_debug();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
 	sysdev_unregister(&kvm_sysdev);
@@ -2389,6 +2388,7 @@ void kvm_exit(void)
 	on_each_cpu(hardware_disable, NULL, 1);
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
+	kvm_exit_debug();
 	free_cpumask_var(cpus_hardware_enabled);
 	__free_page(hwpoison_page);
 	__free_page(bad_page);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/4] KVM: MMU: improve active sp audit
  2010-08-28 11:58 [PATCH 0/4] KVM: MMU: mmu audit code improved Xiao Guangrong
  2010-08-28 12:00 ` [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamicly Xiao Guangrong
@ 2010-08-28 12:01 ` Xiao Guangrong
  2010-08-28 12:02 ` [PATCH 3/4] KVM: MMU: improve spte audit Xiao Guangrong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-28 12:01 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Both audit_rmap() and audit_write_protection() need to walk all active sps, so we can do
this checking in a single sp walk

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu_debug.c |   80 +++++++++++++++++++++++----------------------
 1 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index d2c0048..812d6dc 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -70,6 +70,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 	return;
 }
 
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+		fn(kvm, sp);
+}
+
 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				gva_t va, int level)
 {
@@ -180,67 +190,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
 	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
 }
 
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	struct kvm_mmu_page *sp;
 	int i;
 
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
-
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
+	if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+		return;
 
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (!is_rmap_spte(sp->spt[i]))
+			return;
 
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
+		inspect_spte_has_rmap(kvm, sp->spt + i);
 	}
-	return;
 }
 
-static void audit_rmap(struct kvm_vcpu *vcpu)
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
+	if (sp->role.direct || sp->unsync || sp->role.invalid)
+		return;
+
+	slot = gfn_to_memslot(kvm, sp->gfn);
+	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		if (is_writable_pte(*spte))
+			printk(KERN_ERR "%s: (%s) shadow page has "
 				"writable mappings: gfn %llx role %x\n",
 			       __func__, audit_msg, sp->gfn,
 			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
+		spte = rmap_next(kvm, rmapp, spte);
 	}
 }
 
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	check_mappings_rmap(kvm, sp);
+	audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+	walk_all_active_sps(kvm, audit_sp);
+}
+
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
 {
 	audit_msg = msg;
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
+	audit_all_active_sps(vcpu->kvm);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
 	audit_sptes_have_rmaps(vcpu);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/4] KVM: MMU: improve spte audit
  2010-08-28 11:58 [PATCH 0/4] KVM: MMU: mmu audit code improved Xiao Guangrong
  2010-08-28 12:00 ` [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamicly Xiao Guangrong
  2010-08-28 12:01 ` [PATCH 2/4] KVM: MMU: improve active sp audit Xiao Guangrong
@ 2010-08-28 12:02 ` Xiao Guangrong
  2010-08-28 12:03 ` [PATCH 4/4] KVM: MMU: lower the aduit frequency Xiao Guangrong
  2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamicly Xiao Guangrong
  4 siblings, 0 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-28 12:02 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Both audit_mappings() and audit_sptes_have_rmaps() need to walk the vcpu's page table, so we can do
this checking in a single spte walk

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu_debug.c |  148 +++++++++++++++++++++------------------------
 1 files changed, 69 insertions(+), 79 deletions(-)

diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index 812d6dc..c4ebe6a 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -24,23 +24,24 @@ static bool mmu_debug;
 
 static const char *audit_msg;
 
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
 
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn, int level)
 {
 	int i;
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
+		u64 *ent = sp->spt;
+
+		fn(vcpu, ent + i, level);
+
+		if (is_shadow_present_pte(ent[i]) &&
+		      !is_last_spte(ent[i], level)) {
+			struct kvm_mmu_page *child;
+
+			child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+			__mmu_spte_walk(vcpu, child, fn, level - 1);
 		}
 	}
 }
@@ -52,19 +53,21 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		__mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
 		return;
 	}
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
 		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
+			__mmu_spte_walk(vcpu, sp, fn, 2);
 		}
 	}
 	return;
@@ -80,80 +83,56 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
 		fn(kvm, sp);
 }
 
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
+	struct kvm_mmu_page *sp;
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
 
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
+	sp = page_header(__pa(sptep));
+
+	if (sp->unsync) {
+		if (level != PT_PAGE_TABLE_LEVEL) {
+			printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+					audit_msg, sp, level);
 			return;
 		}
 
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
+		if (*sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+					audit_msg, sp);
 			return;
+		}
+	}
 
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+		printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+				audit_msg, sp);
+		return;
+	}
 
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
+	if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+		return;
 
-		hpa =  pfn << PAGE_SHIFT;
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
 
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
+		return;
 	}
-}
 
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
+	hpa =  pfn << PAGE_SHIFT;
+
+	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+		printk(KERN_ERR "xx audit error: (%s) levels %d"
+			       "pfn %llx hpa %llx ent %llxn",
+			       audit_msg, vcpu->arch.mmu.root_level,
+			       pfn, hpa, *sptep);
 }
 
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
@@ -185,9 +164,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	}
 }
 
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+	if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+		inspect_spte_has_rmap(vcpu->kvm, sptep);
 }
 
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -239,13 +219,23 @@ static void audit_all_active_sps(struct kvm *kvm)
 	walk_all_active_sps(kvm, audit_sp);
 }
 
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	audit_sptes_have_rmaps(vcpu, sptep, level);
+	audit_mappings(vcpu, sptep, level);
+}
+
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, audit_spte);
+}
+
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
 {
 	audit_msg = msg;
 	audit_all_active_sps(vcpu->kvm);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
+	audit_vcpu_spte(vcpu);
 }
 
 static void mmu_debug_enable(void)
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 4/4] KVM: MMU: lower the audit frequency
  2010-08-28 11:58 [PATCH 0/4] KVM: MMU: mmu audit code improved Xiao Guangrong
                   ` (2 preceding siblings ...)
  2010-08-28 12:02 ` [PATCH 3/4] KVM: MMU: improve spte audit Xiao Guangrong
@ 2010-08-28 12:03 ` Xiao Guangrong
  2010-08-29  9:19   ` Avi Kivity
  2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamicly Xiao Guangrong
  4 siblings, 1 reply; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-28 12:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

The audit is very high overhead, so we need to lower its frequency to keep the guest running

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu_debug.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index c4ebe6a..bc61b3d 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/debugfs.h>
+#include <linux/ratelimit.h>
 
 static struct dentry *debugfs_file;
 static bool mmu_debug;
@@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+	if (!__ratelimit(&ratelimit_state))
+		return;
+
 	audit_msg = msg;
 	audit_all_active_sps(vcpu->kvm);
 	audit_vcpu_spte(vcpu);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically
  2010-08-28 12:00 ` [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
@ 2010-08-29  9:16   ` Avi Kivity
  2010-08-30  1:58     ` Xiao Guangrong
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2010-08-29  9:16 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, LKML, KVM

  On 08/28/2010 03:00 PM, Xiao Guangrong wrote:
> Add the debugfs file named 'mmu-debug', we can disable/enable mmu audit by
> this file:
>
> enable:
> echo 1>  debugfs/kvm/mmu-debug
>
> disable:
> echo 0>  debugfs/kvm/mmu-debug


Better as a runtime rw module parameter perhaps?  At least it avoids the 
large debugfs callbacks.

Also, call it audit to preserve the name.


> This patch not change the logic
>
> Signed-off-by: Xiao Guangrong<xiaoguangrong@cn.fujitsu.com>
> ---
>   arch/x86/kvm/Kconfig       |    6 +
>   arch/x86/kvm/mmu.c         |  250 ++--------------------------------
>   arch/x86/kvm/mmu_debug.c   |  329 ++++++++++++++++++++++++++++++++++++++++++++

Please put the move to mmu_debug in a separate patch.

> +
> +static void mmu_debug_enable(void)
> +{
> +	int ret;
> +
> +	if (mmu_debug)
> +		return;
> +
> +	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
> +	WARN_ON(ret);
> +
> +	mmu_debug = true;
> +}

Really neat use of tracepoints.

> diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
> index 3aab0f0..28a0e1f 100644
> --- a/arch/x86/kvm/mmutrace.h
> +++ b/arch/x86/kvm/mmutrace.h
> @@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
>
>   	TP_ARGS(sp)
>   );
> +
> +TRACE_EVENT(
> +	kvm_mmu_audit,
> +	TP_PROTO(struct kvm_vcpu *vcpu, const char *msg),
> +	TP_ARGS(vcpu, msg),
> +
> +	TP_STRUCT__entry(
> +		__field(struct kvm_vcpu *, vcpu)
> +		__field(const char *, msg)
> +	),

enum instead of char *, maybe something in userspace can make use of this.

> +
> +	TP_fast_assign(
> +		__entry->vcpu = vcpu;
> +		__entry->msg = msg;
> +	),
> +
> +	TP_printk("%s", __entry->msg)

Here, of course, you can use print_symbolic() to preserve readability.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/4] KVM: MMU: lower the audit frequency
  2010-08-28 12:03 ` [PATCH 4/4] KVM: MMU: lower the audit frequency Xiao Guangrong
@ 2010-08-29  9:19   ` Avi Kivity
  2010-08-30  2:16     ` Xiao Guangrong
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2010-08-29  9:19 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, LKML, KVM

  On 08/28/2010 03:03 PM, Xiao Guangrong wrote:
> The audit is very high overhead, so we need lower the frequency to assure the guest running
>
>
>    */
>
>   #include<linux/debugfs.h>
> +#include<linux/ratelimit.h>
>
>   static struct dentry *debugfs_file;
>   static bool mmu_debug;
> @@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>
>   static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
>   {
> +	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
> +
> +	if (!__ratelimit(&ratelimit_state))
> +		return;
> +
>   	audit_msg = msg;
>   	audit_all_active_sps(vcpu->kvm);
>   	audit_vcpu_spte(vcpu);

This means we see a bug long after it happened, so we can't correlate it 
to the cause.

It's fine as an option (even the default) but I'd like to be able to 
audit after every operation.  Perhaps a partial audit that only looks at 
the gfns and vaddrs that were affected in the last operation?

I have to admit, it's been a very long time since I last used audit.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically
  2010-08-29  9:16   ` Avi Kivity
@ 2010-08-30  1:58     ` Xiao Guangrong
  0 siblings, 0 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30  1:58 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

On 08/29/2010 05:16 PM, Avi Kivity wrote:
>  On 08/28/2010 03:00 PM, Xiao Guangrong wrote:
>> Add the debugfs file named 'mmu-debug', we can disable/enable mmu
>> audit by
>> this file:
>>
>> enable:
>> echo 1>  debugfs/kvm/mmu-debug
>>
>> disable:
>> echo 0>  debugfs/kvm/mmu-debug
> 
> 
> Better as a runtime rw module parameter perhaps?  At least it avoids the
> large debugfs callbacks.
> 

Yeah, it's a good idea.

> Also, call it audit to preserve the name.

OK

>> +
>> +TRACE_EVENT(
>> +    kvm_mmu_audit,
>> +    TP_PROTO(struct kvm_vcpu *vcpu, const char *msg),
>> +    TP_ARGS(vcpu, msg),
>> +
>> +    TP_STRUCT__entry(
>> +        __field(struct kvm_vcpu *, vcpu)
>> +        __field(const char *, msg)
>> +    ),
> 
> enum instead of char *, maybe something in userspace can make use of this.
> 

OK

>> +
>> +    TP_fast_assign(
>> +        __entry->vcpu = vcpu;
>> +        __entry->msg = msg;
>> +    ),
>> +
>> +    TP_printk("%s", __entry->msg)
> 
> Here, of course, you can use print_symbolic() to preserve readability.

OK

Will fix them in the next version. 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/4] KVM: MMU: lower the audit frequency
  2010-08-29  9:19   ` Avi Kivity
@ 2010-08-30  2:16     ` Xiao Guangrong
  2010-08-30  6:59       ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30  2:16 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

On 08/29/2010 05:19 PM, Avi Kivity wrote:
>  On 08/28/2010 03:03 PM, Xiao Guangrong wrote:
>> The audit is very high overhead, so we need lower the frequency to
>> assure the guest running
>>
>>
>>    */
>>
>>   #include<linux/debugfs.h>
>> +#include<linux/ratelimit.h>
>>
>>   static struct dentry *debugfs_file;
>>   static bool mmu_debug;
>> @@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>>
>>   static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const
>> char *msg)
>>   {
>> +    static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
>> +
>> +    if (!__ratelimit(&ratelimit_state))
>> +        return;
>> +
>>       audit_msg = msg;
>>       audit_all_active_sps(vcpu->kvm);
>>       audit_vcpu_spte(vcpu);
> 
> This means we see a bug long after it happened, so we can't correlate it
> to the cause.
> 
> It's fine as an option (even the default) but I'd like to be able to
> audit after every operation.  Perhaps a partial audit that only looks at
> the gfns and vaddrs that were affected in the last operation?
> 

Audit checks all the active shadow pages and all vcpus' page tables, so the
overhead is very high :-)

During my test, if the audit is enabled, the guest mostly hangs, which means
the guest does not do anything.
(Host: Intel(R) Xeon(R) X3430  @ 2.40GHz * 4 + 4G memory
 Guest: x2VCPU + 1G memory
)

I'll set the 'ratelimit' as a module parameter, then if the user's machine is
fast enough, the ratelimit can be disabled.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/4] KVM: MMU: lower the audit frequency
  2010-08-30  2:16     ` Xiao Guangrong
@ 2010-08-30  6:59       ` Avi Kivity
  0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2010-08-30  6:59 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, LKML, KVM

  On 08/30/2010 05:16 AM, Xiao Guangrong wrote:
> On 08/29/2010 05:19 PM, Avi Kivity wrote:
>>   On 08/28/2010 03:03 PM, Xiao Guangrong wrote:
>>> The audit is very high overhead, so we need lower the frequency to
>>> assure the guest running
>>>
>>>
>>>     */
>>>
>>>    #include<linux/debugfs.h>
>>> +#include<linux/ratelimit.h>
>>>
>>>    static struct dentry *debugfs_file;
>>>    static bool mmu_debug;
>>> @@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>>>
>>>    static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const
>>> char *msg)
>>>    {
>>> +    static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
>>> +
>>> +    if (!__ratelimit(&ratelimit_state))
>>> +        return;
>>> +
>>>        audit_msg = msg;
>>>        audit_all_active_sps(vcpu->kvm);
>>>        audit_vcpu_spte(vcpu);
>> This means we see a bug long after it happened, so we can't correlate it
>> to the cause.
>>
>> It's fine as an option (even the default) but I'd like to be able to
>> audit after every operation.  Perhaps a partial audit that only looks at
>> the gfns and vaddrs that were affected in the last operation?
>>
> Audit checks all the active shadow pages and all vcpus' page tables, so the
> overhead is very high :-)
>
> During my test, if the audit is enabled, the guest mostly hangs, which means
> the guest does not do anything.
> (Host: Intel(R) Xeon(R) X3430  @ 2.40GHz * 4 + 4G memory
>   Guest: x2VCPU + 1G memory
> )

You're right, I remember that from the last time I used audit many years 
ago.

> I'll set the 'ratelimit' as a module parameter, then if the user's machine is
> fast enough, the ratelimit can be disabled.

It's only useful in very special cases - low memory and a very fast 
reproducer.  I think we can live without the parameter, if someone has 
this special case they can hack the code.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically
  2010-08-28 11:58 [PATCH 0/4] KVM: MMU: mmu audit code improved Xiao Guangrong
                   ` (3 preceding siblings ...)
  2010-08-28 12:03 ` [PATCH 4/4] KVM: MMU: lower the audit frequency Xiao Guangrong
@ 2010-08-30 10:22 ` Xiao Guangrong
  2010-08-30 10:24   ` [PATCH v2 2/5] KVM: MMU: move audit to a separate file Xiao Guangrong
                     ` (3 more replies)
  4 siblings, 4 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30 10:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Add a r/w module parameter named 'mmu_audit'; it controls enabling/disabling the audit:

enable:
echo 1 > /sys/module/kvm/parameters/mmu_audit 

disable:
echo 0 > /sys/module/kvm/parameters/mmu_audit

This patch does not change the logic

V2:
  Using r/w module parameter instead of debugfs file

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/Kconfig       |    7 +++
 arch/x86/kvm/mmu.c         |   91 +++++++++++++++++++++++++++++++++++---------
 arch/x86/kvm/mmutrace.h    |   19 +++++++++
 arch/x86/kvm/paging_tmpl.h |    4 +-
 4 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..ddc131f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_MMU_AUDIT
+	bool "Audit KVM MMU"
+	depends on KVM && TRACEPOINTS
+	---help---
+	 This option adds a R/W KVM module parameter 'mmu_audit', which allows
+	 auditing the KVM MMU at runtime.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8b750ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,15 +49,21 @@
  */
 bool tdp_enabled = false;
 
-#undef MMU_DEBUG
+enum {
+	AUDIT_PRE_PAGE_FAULT,
+	AUDIT_POST_PAGE_FAULT,
+	AUDIT_PRE_PTE_WRITE,
+	AUDIT_POST_PTE_WRITE
+};
 
-#undef AUDIT
+char *audit_point_name[] = {
+	"pre page fault",
+	"post page fault",
+	"pre pte write",
+	"post pte write"
+};
 
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
 
@@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
 
 #endif
 
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
 static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
@@ -2964,7 +2970,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
-	kvm_mmu_audit(vcpu, "pre pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
@@ -3037,7 +3043,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	kvm_mmu_audit(vcpu, "post pte write");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
 		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3483,8 +3489,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
-#ifdef AUDIT
-
+#ifdef CONFIG_KVM_MMU_AUDIT
 static const char *audit_msg;
 
 typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
@@ -3699,18 +3704,68 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
-	int olddbg = dbg;
-
-	dbg = 0;
-	audit_msg = msg;
+	audit_msg = audit_point_name[audit_point];
 	audit_rmap(vcpu);
 	audit_write_protection(vcpu);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
 	audit_sptes_have_rmaps(vcpu);
-	dbg = olddbg;
 }
 
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
 #endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..b60b4fd 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 	TP_ARGS(sp)
 );
+
+TRACE_EVENT(
+	kvm_mmu_audit,
+	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+	TP_ARGS(vcpu, audit_point),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(int, audit_point)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->audit_point = audit_point;
+	),
+
+	TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+		  audit_point_name[__entry->audit_point])
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a0f2feb..debe770 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 
-	kvm_mmu_audit(vcpu, "pre page fault");
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
@@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
-	kvm_mmu_audit(vcpu, "post page fault (fixed)");
+	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return write_pt;
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 2/5] KVM: MMU: move audit to a separate file
  2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
@ 2010-08-30 10:24   ` Xiao Guangrong
  2010-08-30 10:25   ` [PATCH v2 3/5] KVM: MMU: improve active sp audit Xiao Guangrong
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30 10:24 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu.c       |  279 +-------------------------------------------
 arch/x86/kvm/mmu_audit.c |  297 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 298 insertions(+), 278 deletions(-)
 create mode 100644 arch/x86/kvm/mmu_audit.c

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b750ff..d2dad65 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3490,282 +3490,5 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
 
 #ifdef CONFIG_KVM_MMU_AUDIT
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
-		}
-	}
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
-	int i;
-	struct kvm_mmu_page *sp;
-
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
-	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
-		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
-		return;
-	}
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root && VALID_PAGE(root)) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
-		}
-	}
-	return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
-{
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
-
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
-			return;
-		}
-
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
-			return;
-
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
-
-		hpa =  pfn << PAGE_SHIFT;
-
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
-	}
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
-	unsigned long *rmapp;
-	struct kvm_mmu_page *rev_sp;
-	gfn_t gfn;
-
-
-	rev_sp = page_header(__pa(sptep));
-	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
-	if (!gfn_to_memslot(kvm, gfn)) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
-				 audit_msg, gfn);
-		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
-		       audit_msg, (long int)(sptep - rev_sp->spt),
-				rev_sp->gfn);
-		dump_stack();
-		return;
-	}
-
-	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
-	if (!*rmapp) {
-		if (!printk_ratelimit())
-			return;
-		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
-				 audit_msg, *sptep);
-		dump_stack();
-	}
-}
-
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	int i;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
-
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
-			continue;
-
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
-
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
-	}
-	return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	struct kvm_memory_slot *slot;
-	unsigned long *rmapp;
-	u64 *spte;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %llx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
-	}
-}
-
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
-{
-	audit_msg = audit_point_name[audit_point];
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
-}
-
-static bool mmu_audit;
-
-static void mmu_audit_enable(void)
-{
-	int ret;
-
-	if (mmu_audit)
-		return;
-
-	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	WARN_ON(ret);
-
-	mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
-	if (!mmu_audit)
-		return;
-
-	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	tracepoint_synchronize_unregister();
-	mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
-	int ret;
-	unsigned long enable;
-
-	ret = strict_strtoul(val, 10, &enable);
-	if (ret < 0)
-		return -EINVAL;
-
-	switch (enable) {
-	case 0:
-		mmu_audit_disable();
-		break;
-	case 1:
-		mmu_audit_enable();
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
-	.set = mmu_audit_set,
-	.get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
+#include "mmu_audit.c"
 #endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 0000000..fb8a461
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,297 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			if (!is_last_spte(ent, sp->role.level)) {
+				struct kvm_mmu_page *child;
+				child = page_header(ent & PT64_BASE_ADDR_MASK);
+				__mmu_spte_walk(kvm, child, fn);
+			} else
+				fn(kvm, &sp->spt[i]);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu->kvm, sp, fn);
+		}
+	}
+	return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
+
+		sp = page_header(__pa(sptep));
+
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
+
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
+			}
+		}
+
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
+		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
+	}
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned i;
+
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+	else
+		for (i = 0; i < 4; ++i)
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->arch.mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
+	}
+
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
+
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (!is_rmap_spte(pt[i]))
+				continue;
+
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+		}
+	}
+	return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	u64 *spte;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.direct)
+			continue;
+		if (sp->unsync)
+			continue;
+		if (sp->role.invalid)
+			continue;
+
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+		spte = rmap_next(vcpu->kvm, rmapp, NULL);
+		while (spte) {
+			if (is_writable_pte(*spte))
+				printk(KERN_ERR "%s: (%s) shadow page has "
+				"writable mappings: gfn %llx role %x\n",
+			       __func__, audit_msg, sp->gfn,
+			       sp->role.word);
+			spte = rmap_next(vcpu->kvm, rmapp, spte);
+		}
+	}
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
+{
+	audit_msg = audit_point_name[audit_point];
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	if (strcmp("pre pte write", audit_msg) != 0)
+		audit_mappings(vcpu);
+	audit_sptes_have_rmaps(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
-- 
1.7.0.4

-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
-	struct kvm_memory_slot *slot;
-	unsigned long *rmapp;
-	u64 *spte;
-
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
-
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
-				"writable mappings: gfn %llx role %x\n",
-			       __func__, audit_msg, sp->gfn,
-			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
-	}
-}
-
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
-{
-	audit_msg = audit_point_name[audit_point];
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
-}
-
-static bool mmu_audit;
-
-static void mmu_audit_enable(void)
-{
-	int ret;
-
-	if (mmu_audit)
-		return;
-
-	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	WARN_ON(ret);
-
-	mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
-	if (!mmu_audit)
-		return;
-
-	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	tracepoint_synchronize_unregister();
-	mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
-	int ret;
-	unsigned long enable;
-
-	ret = strict_strtoul(val, 10, &enable);
-	if (ret < 0)
-		return -EINVAL;
-
-	switch (enable) {
-	case 0:
-		mmu_audit_disable();
-		break;
-	case 1:
-		mmu_audit_enable();
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
-	.set = mmu_audit_set,
-	.get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
+#include "mmu_audit.c"
 #endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 0000000..fb8a461
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,297 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			if (!is_last_spte(ent, sp->role.level)) {
+				struct kvm_mmu_page *child;
+				child = page_header(ent & PT64_BASE_ADDR_MASK);
+				__mmu_spte_walk(kvm, child, fn);
+			} else
+				fn(kvm, &sp->spt[i]);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu->kvm, sp, fn);
+		}
+	}
+	return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+				gva_t va, int level)
+{
+	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+	int i;
+	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+		u64 *sptep = pt + i;
+		struct kvm_mmu_page *sp;
+		gfn_t gfn;
+		pfn_t pfn;
+		hpa_t hpa;
+
+		sp = page_header(__pa(sptep));
+
+		if (sp->unsync) {
+			if (level != PT_PAGE_TABLE_LEVEL) {
+				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+						audit_msg, sp, level);
+				return;
+			}
+
+			if (*sptep == shadow_notrap_nonpresent_pte) {
+				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+						audit_msg, sp);
+				return;
+			}
+		}
+
+		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+					audit_msg, sp);
+			return;
+		}
+
+		if (!is_shadow_present_pte(*sptep) ||
+		      !is_last_spte(*sptep, level))
+			return;
+
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
+			return;
+		}
+
+		hpa =  pfn << PAGE_SHIFT;
+
+		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+			printk(KERN_ERR "xx audit error: (%s) levels %d"
+					   " gva %lx pfn %llx hpa %llx ent %llxn",
+					   audit_msg, vcpu->arch.mmu.root_level,
+					   va, pfn, hpa, *sptep);
+	}
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned i;
+
+	if (vcpu->arch.mmu.root_level == 4)
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+	else
+		for (i = 0; i < 4; ++i)
+			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+				audit_mappings_page(vcpu,
+						    vcpu->arch.mmu.pae_root[i],
+						    i << 30,
+						    2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+
+	rev_sp = page_header(__pa(sptep));
+	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+	if (!gfn_to_memslot(kvm, gfn)) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+				 audit_msg, gfn);
+		printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+		       audit_msg, (long int)(sptep - rev_sp->spt),
+				rev_sp->gfn);
+		dump_stack();
+		return;
+	}
+
+	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	if (!*rmapp) {
+		if (!printk_ratelimit())
+			return;
+		printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+				 audit_msg, *sptep);
+		dump_stack();
+	}
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	int i;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		u64 *pt = sp->spt;
+
+		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+			continue;
+
+		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+			if (!is_rmap_spte(pt[i]))
+				continue;
+
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+		}
+	}
+	return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+	check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmapp;
+	u64 *spte;
+
+	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+		if (sp->role.direct)
+			continue;
+		if (sp->unsync)
+			continue;
+		if (sp->role.invalid)
+			continue;
+
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+		spte = rmap_next(vcpu->kvm, rmapp, NULL);
+		while (spte) {
+			if (is_writable_pte(*spte))
+				printk(KERN_ERR "%s: (%s) shadow page has "
+				"writable mappings: gfn %llx role %x\n",
+			       __func__, audit_msg, sp->gfn,
+			       sp->role.word);
+			spte = rmap_next(vcpu->kvm, rmapp, spte);
+		}
+	}
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
+{
+	audit_msg = audit_point_name[audit_point];
+	audit_rmap(vcpu);
+	audit_write_protection(vcpu);
+	if (strcmp("pre pte write", audit_msg) != 0)
+		audit_mappings(vcpu);
+	audit_sptes_have_rmaps(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+	int ret;
+
+	if (mmu_audit)
+		return;
+
+	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	WARN_ON(ret);
+
+	mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+	if (!mmu_audit)
+		return;
+
+	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+	tracepoint_synchronize_unregister();
+	mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	unsigned long enable;
+
+	ret = strict_strtoul(val, 10, &enable);
+	if (ret < 0)
+		return -EINVAL;
+
+	switch (enable) {
+	case 0:
+		mmu_audit_disable();
+		break;
+	case 1:
+		mmu_audit_enable();
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+	.set = mmu_audit_set,
+	.get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 3/5] KVM: MMU: improve active sp audit
  2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
  2010-08-30 10:24   ` [PATCH v2 2/5] KVM: MMU: move audit to a separate file Xiao Guangrong
@ 2010-08-30 10:25   ` Xiao Guangrong
  2010-08-30 10:25   ` [PATCH v2 4/5] KVM: MMU: improve spte audit Xiao Guangrong
  2010-08-30 10:26   ` [PATCH v2 5/5] KVM: MMU: lower the audit frequency Xiao Guangrong
  3 siblings, 0 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30 10:25 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Both audit_rmap() and audit_write_protection() need to walk all active shadow pages,
so we can do this checking in a single shadow-page walk

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu_audit.c |   74 +++++++++++++++++++++++----------------------
 1 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index fb8a461..8becb86 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -65,6 +65,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 	return;
 }
 
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+		fn(kvm, sp);
+}
+
 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				gva_t va, int level)
 {
@@ -175,67 +185,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
 	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
 }
 
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	struct kvm_mmu_page *sp;
 	int i;
 
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		u64 *pt = sp->spt;
+	if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+		return;
 
-		if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (!is_rmap_spte(sp->spt[i]))
 			continue;
 
-		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (!is_rmap_spte(pt[i]))
-				continue;
-
-			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
-		}
+		inspect_spte_has_rmap(kvm, sp->spt + i);
 	}
-	return;
 }
 
-static void audit_rmap(struct kvm_vcpu *vcpu)
+void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_page *sp;
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 
-	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
-		if (sp->role.direct)
-			continue;
-		if (sp->unsync)
-			continue;
-		if (sp->role.invalid)
-			continue;
+	if (sp->role.direct || sp->unsync || sp->role.invalid)
+		return;
 
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
-		rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+	slot = gfn_to_memslot(kvm, sp->gfn);
+	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-		spte = rmap_next(vcpu->kvm, rmapp, NULL);
-		while (spte) {
-			if (is_writable_pte(*spte))
-				printk(KERN_ERR "%s: (%s) shadow page has "
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		if (is_writable_pte(*spte))
+			printk(KERN_ERR "%s: (%s) shadow page has "
 				"writable mappings: gfn %llx role %x\n",
 			       __func__, audit_msg, sp->gfn,
 			       sp->role.word);
-			spte = rmap_next(vcpu->kvm, rmapp, spte);
-		}
+		spte = rmap_next(kvm, rmapp, spte);
 	}
 }
 
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	check_mappings_rmap(kvm, sp);
+	audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+	walk_all_active_sps(kvm, audit_sp);
+}
+
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
 	audit_msg = audit_point_name[audit_point];
-	audit_rmap(vcpu);
-	audit_write_protection(vcpu);
+	audit_all_active_sps(vcpu->kvm);
 	if (strcmp("pre pte write", audit_msg) != 0)
 		audit_mappings(vcpu);
 	audit_sptes_have_rmaps(vcpu);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 4/5] KVM: MMU: improve spte audit
  2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
  2010-08-30 10:24   ` [PATCH v2 2/5] KVM: MMU: move audit to a separate file Xiao Guangrong
  2010-08-30 10:25   ` [PATCH v2 3/5] KVM: MMU: improve active sp audit Xiao Guangrong
@ 2010-08-30 10:25   ` Xiao Guangrong
  2010-08-30 10:26   ` [PATCH v2 5/5] KVM: MMU: lower the audit frequency Xiao Guangrong
  3 siblings, 0 replies; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30 10:25 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

Both audit_mappings() and audit_sptes_have_rmaps() need to walk the vcpu's page table,
so we can do this checking in a single spte walk

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu_audit.c |  148 +++++++++++++++++++++------------------------
 1 files changed, 69 insertions(+), 79 deletions(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 8becb86..3bde186 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,23 +19,24 @@
 
 static const char *audit_msg;
 
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
 
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
-			    inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn, int level)
 {
 	int i;
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		u64 ent = sp->spt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				struct kvm_mmu_page *child;
-				child = page_header(ent & PT64_BASE_ADDR_MASK);
-				__mmu_spte_walk(kvm, child, fn);
-			} else
-				fn(kvm, &sp->spt[i]);
+		u64 *ent = sp->spt;
+
+		fn(vcpu, ent + i, level);
+
+		if (is_shadow_present_pte(ent[i]) &&
+		      !is_last_spte(ent[i], level)) {
+			struct kvm_mmu_page *child;
+
+			child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+			__mmu_spte_walk(vcpu, child, fn, level - 1);
 		}
 	}
 }
@@ -47,21 +48,25 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
+
 		sp = page_header(root);
-		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		__mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
 		return;
 	}
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
 		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = page_header(root);
-			__mmu_spte_walk(vcpu->kvm, sp, fn);
+			__mmu_spte_walk(vcpu, sp, fn, 2);
 		}
 	}
+
 	return;
 }
 
@@ -75,80 +80,55 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
 		fn(kvm, sp);
 }
 
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-				gva_t va, int level)
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
-	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-	int i;
-	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-		u64 *sptep = pt + i;
-		struct kvm_mmu_page *sp;
-		gfn_t gfn;
-		pfn_t pfn;
-		hpa_t hpa;
-
-		sp = page_header(__pa(sptep));
-
-		if (sp->unsync) {
-			if (level != PT_PAGE_TABLE_LEVEL) {
-				printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
-						audit_msg, sp, level);
-				return;
-			}
-
-			if (*sptep == shadow_notrap_nonpresent_pte) {
-				printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
-						audit_msg, sp);
-				return;
-			}
-		}
+	struct kvm_mmu_page *sp;
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
 
-		if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-			printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
-					audit_msg, sp);
+	sp = page_header(__pa(sptep));
+
+	if (sp->unsync) {
+		if (level != PT_PAGE_TABLE_LEVEL) {
+			printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+				audit_msg, sp, level);
 			return;
 		}
 
-		if (!is_shadow_present_pte(*sptep) ||
-		      !is_last_spte(*sptep, level))
+		if (*sptep == shadow_notrap_nonpresent_pte) {
+			printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+				audit_msg, sp);
 			return;
+		}
+	}
 
-		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-		pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+		printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+			audit_msg, sp);
+		return;
+	}
 
-		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
-			return;
-		}
+	if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+		return;
 
-		hpa =  pfn << PAGE_SHIFT;
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
 
-		if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
-			printk(KERN_ERR "xx audit error: (%s) levels %d"
-					   " gva %lx pfn %llx hpa %llx ent %llxn",
-					   audit_msg, vcpu->arch.mmu.root_level,
-					   va, pfn, hpa, *sptep);
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
+		return;
 	}
-}
 
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-	unsigned i;
-
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
-		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
-				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
+	hpa =  pfn << PAGE_SHIFT;
+	if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+		printk(KERN_ERR "xx audit error: (%s) levels %d"
+				   "pfn %llx hpa %llx ent %llxn",
+				   audit_msg, vcpu->arch.mmu.root_level,
+				   pfn, hpa, *sptep);
 }
 
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
@@ -180,9 +160,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	}
 }
 
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 {
-	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+	if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+		inspect_spte_has_rmap(vcpu->kvm, sptep);
 }
 
 static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -234,13 +215,22 @@ static void audit_all_active_sps(struct kvm *kvm)
 	walk_all_active_sps(kvm, audit_sp);
 }
 
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+	audit_sptes_have_rmaps(vcpu, sptep, level);
+	audit_mappings(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, audit_spte);
+}
+
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
 	audit_msg = audit_point_name[audit_point];
 	audit_all_active_sps(vcpu->kvm);
-	if (strcmp("pre pte write", audit_msg) != 0)
-		audit_mappings(vcpu);
-	audit_sptes_have_rmaps(vcpu);
+	audit_vcpu_spte(vcpu);
 }
 
 static bool mmu_audit;
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v2 5/5] KVM: MMU: lower the audit frequency
  2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
                     ` (2 preceding siblings ...)
  2010-08-30 10:25   ` [PATCH v2 4/5] KVM: MMU: improve spte audit Xiao Guangrong
@ 2010-08-30 10:26   ` Xiao Guangrong
  2010-08-30 15:47     ` Marcelo Tosatti
  3 siblings, 1 reply; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-30 10:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, LKML, KVM

The audit has very high overhead, so we need to lower its frequency to keep the guest running

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
 arch/x86/kvm/mmu_audit.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 3bde186..bd2b1be 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -17,6 +17,8 @@
  *
  */
 
+#include <linux/ratelimit.h>
+
 static const char *audit_msg;
 
 typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 
 static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+	if (!__ratelimit(&ratelimit_state))
+		return;
+
 	audit_msg = audit_point_name[audit_point];
 	audit_all_active_sps(vcpu->kvm);
 	audit_vcpu_spte(vcpu);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency
  2010-08-30 10:26   ` [PATCH v2 5/5] KVM: MMU: lower the audit frequency Xiao Guangrong
@ 2010-08-30 15:47     ` Marcelo Tosatti
  2010-08-31  2:27       ` Xiao Guangrong
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2010-08-30 15:47 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, LKML, KVM

On Mon, Aug 30, 2010 at 06:26:33PM +0800, Xiao Guangrong wrote:
> The audit has very high overhead, so we need to lower its frequency to keep the guest running
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
> ---
>  arch/x86/kvm/mmu_audit.c |    7 +++++++
>  1 files changed, 7 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
> index 3bde186..bd2b1be 100644
> --- a/arch/x86/kvm/mmu_audit.c
> +++ b/arch/x86/kvm/mmu_audit.c
> @@ -17,6 +17,8 @@
>   *
>   */
>  
> +#include <linux/ratelimit.h>
> +
>  static const char *audit_msg;
>  
>  typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
> @@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>  
>  static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
>  {
> +	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
> +
> +	if (!__ratelimit(&ratelimit_state))
> +		return;
> +
>  	audit_msg = audit_point_name[audit_point];
>  	audit_all_active_sps(vcpu->kvm);
>  	audit_vcpu_spte(vcpu);
> -- 
> 1.7.0.4

Well, as Avi said, this makes it difficult to trace back to the offender (the
audit points are placed around modifications to the shadow page tree 
for that reason).

I've always seen progress from the guest while running with audit
enabled (its slow, but its not supposed to be fast anyway). 

Did you experience a freeze? 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency
  2010-08-30 15:47     ` Marcelo Tosatti
@ 2010-08-31  2:27       ` Xiao Guangrong
  2010-09-01  9:06         ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Xiao Guangrong @ 2010-08-31  2:27 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, LKML, KVM

On 08/30/2010 11:47 PM, Marcelo Tosatti wrote:
> On Mon, Aug 30, 2010 at 06:26:33PM +0800, Xiao Guangrong wrote:
>> The audit is very high overhead, so we need to lower the frequency to keep the guest running
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
>> ---
>>  arch/x86/kvm/mmu_audit.c |    7 +++++++
>>  1 files changed, 7 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
>> index 3bde186..bd2b1be 100644
>> --- a/arch/x86/kvm/mmu_audit.c
>> +++ b/arch/x86/kvm/mmu_audit.c
>> @@ -17,6 +17,8 @@
>>   *
>>   */
>>  
>> +#include <linux/ratelimit.h>
>> +
>>  static const char *audit_msg;
>>  
>>  typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
>> @@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>>  
>>  static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
>>  {
>> +	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
>> +
>> +	if (!__ratelimit(&ratelimit_state))
>> +		return;
>> +
>>  	audit_msg = audit_point_name[audit_point];
>>  	audit_all_active_sps(vcpu->kvm);
>>  	audit_vcpu_spte(vcpu);
>> -- 
>> 1.7.0.4
> 
> Well, as Avi said, this makes it difficult to trace back to the offender (the
> audit points are placed around modifications to the shadow page tree 
> for that reason).
> 

Yeah, it would be best not to rate limit it, but...

> I've always seen progress from the guest while running with audit
> enabled (it's slow, but it's not supposed to be fast anyway). 
> 
> Did you experience a freeze? 
> 

Here is a simple test in the guest when it is not rate limited:

# time ls
anaconda-ks.cfg  Documents  install.log         Music     Public     Videos
Desktop          Downloads  install.log.syslog  Pictures  Templates

real    1m26.053s
user    0m0.311s
sys     0m1.813s

The 'ls' command cost about 1.5 minutes; if we run the memory test program, I think
the time/delay is unacceptable...... :-(

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency
  2010-08-31  2:27       ` Xiao Guangrong
@ 2010-09-01  9:06         ` Avi Kivity
  2010-09-01 16:27           ` Marcelo Tosatti
  0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2010-09-01  9:06 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, LKML, KVM

  On 08/31/2010 05:27 AM, Xiao Guangrong wrote:
>
>> I've always seen progress from the guest while running with audit
>> enabled (it's slow, but it's not supposed to be fast anyway).
>>
>> Did you experience a freeze?
>>
> Here is a simple test in the guest when it is not rate limited:
>
> # time ls
> anaconda-ks.cfg  Documents  install.log         Music     Public     Videos
> Desktop          Downloads  install.log.syslog  Pictures  Templates
>
> real    1m26.053s
> user    0m0.311s
> sys     0m1.813s
>
> The 'ls' command cost about 1.5 minutes; if we run the memory test program, I think
> the time/delay is unacceptable...... :-(

Marcelo, would making the ratelimit optional help?  Personally, I think 
without ratelimit audit is useless, and with ratelimit it is a lot less 
useful but can still point out problems.  But I haven't used it in a while.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency
  2010-09-01  9:06         ` Avi Kivity
@ 2010-09-01 16:27           ` Marcelo Tosatti
  2010-09-02  8:30             ` Avi Kivity
  0 siblings, 1 reply; 20+ messages in thread
From: Marcelo Tosatti @ 2010-09-01 16:27 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Xiao Guangrong, LKML, KVM

On Wed, Sep 01, 2010 at 12:06:37PM +0300, Avi Kivity wrote:
>  On 08/31/2010 05:27 AM, Xiao Guangrong wrote:
> >
> >>I've always seen progress from the guest while running with audit
> >>enabled (it's slow, but it's not supposed to be fast anyway).
> >>
> >>Did you experience a freeze?
> >>
> >Here is a simple test in the guest when it is not rate limited:
> >
> ># time ls
> >anaconda-ks.cfg  Documents  install.log         Music     Public     Videos
> >Desktop          Downloads  install.log.syslog  Pictures  Templates
> >
> >real    1m26.053s
> >user    0m0.311s
> >sys     0m1.813s
> >
> >The 'ls' command cost about 1.5 minutes; if we run the memory test program, I think
> >the time/delay is unacceptable...... :-(
> 
> Marcelo, would making the ratelimit optional help?  personally I
> think without ratelimit audit is useless, and with ratelimit it is a
> lot less useful but can still point out problems.  But I haven't
> used it in a while.

Was just wondering whether there was a freeze. Patchset looks good
(ratelimit can be disabled manually).

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency
  2010-09-01 16:27           ` Marcelo Tosatti
@ 2010-09-02  8:30             ` Avi Kivity
  0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2010-09-02  8:30 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, LKML, KVM

  On 09/01/2010 07:27 PM, Marcelo Tosatti wrote:
>
> Was just wondering whether there was a freeze. Patchset looks good
> (ratelimit can be disabled manually).
>

Ok, applied now.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2010-09-02  8:30 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-28 11:58 [PATCH 0/4] KVM: MMU: mmu audit code improved Xiao Guangrong
2010-08-28 12:00 ` [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
2010-08-29  9:16   ` Avi Kivity
2010-08-30  1:58     ` Xiao Guangrong
2010-08-28 12:01 ` [PATCH 2/4] KVM: MMU: improve active sp audit Xiao Guangrong
2010-08-28 12:02 ` [PATCH 3/4] KVM: MMU: improve spte audit Xiao Guangrong
2010-08-28 12:03 ` [PATCH 4/4] KVM: MMU: lower the audit frequency Xiao Guangrong
2010-08-29  9:19   ` Avi Kivity
2010-08-30  2:16     ` Xiao Guangrong
2010-08-30  6:59       ` Avi Kivity
2010-08-30 10:22 ` [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically Xiao Guangrong
2010-08-30 10:24   ` [PATCH v2 2/5] KVM: MMU: move audit to a separate file Xiao Guangrong
2010-08-30 10:25   ` [PATCH v2 3/5] KVM: MMU: improve active sp audit Xiao Guangrong
2010-08-30 10:25   ` [PATCH v2 4/5] KVM: MMU: improve spte audit Xiao Guangrong
2010-08-30 10:26   ` [PATCH v2 5/5] KVM: MMU: lower the audit frequency Xiao Guangrong
2010-08-30 15:47     ` Marcelo Tosatti
2010-08-31  2:27       ` Xiao Guangrong
2010-09-01  9:06         ` Avi Kivity
2010-09-01 16:27           ` Marcelo Tosatti
2010-09-02  8:30             ` Avi Kivity

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).