linux-kernel.vger.kernel.org archive mirror
* [RFC] kvm: Adding skeleton for Memory ROE
@ 2018-07-16 12:12 Ahmed Abd El Mawgood
  2018-07-16 19:35 ` Ahmed Abd El Mawgood
  0 siblings, 1 reply; 3+ messages in thread
From: Ahmed Abd El Mawgood @ 2018-07-16 12:12 UTC (permalink / raw)
  To: Paolo Bonzini, Radim Krčmář,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, kvm,
	linux-kernel, Rik van Riel
  Cc: Ahmed Abd El Mawgood

This is my first patch, an attempt to implement Memory ROE, which I
discussed earlier as a way to prevent rootkits. I have already explained
it in detail in this thread:
https://www.mail-archive.com/kernelnewbies@kernelnewbies.org/msg18826.html
so there is no need to repeat the same thing here.
The problem is that the code isn't working and I can't figure out why.

I tried to implement the protection with behavior similar to that of
KVM_MEM_READONLY, but at the page (SPTE) level.
The current problem I am facing is that while handling the hypercall,
vcpu->mode turns out to be OUTSIDE_GUEST_MODE, but KVM_REQ_TLB_FLUSH
doesn't seem to be handled correctly. The KVM documentation promises that
when a VCPU is not in guest mode, requests are handled as soon as
possible, and that kvm_vcpu_kick(vcpu) will even force that, but it
doesn't seem to be the case for me. This is the kind of logging I am
getting:

[3556.312299] kvm_mmu_slot_apply_flags: visited
[3556.312301] kvm_mmu_slot_apply_write_access: Flush = false
[3557.034243] gfn_is_readonly: test_bit = 0
[3557.034251] gfn_is_readonly: test_bit = 0
[3557.034254] gfn_is_readonly: test_bit = 0
[3557.034463] Hypercall received, page address 0x0
[3557.034466] gfn_is_readonly: test_bit = 0
[3557.034469] kvm_mroe: flush state = Done
[3557.034472] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034475] Setting page number 0 in slot number 0
[3557.034480] slot_rmap_apply_protection: The 0th page is readonly, Flush = True
[3557.034483] kvm_mmu_slot_apply_write_access: Flush = true
[3557.034486] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034488] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034490] kvm_mroe: flush state = Waiting

For some reason kvm_vcpu_kick() didn't force KVM_REQ_TLB_FLUSH to be
handled on the virtual CPU (I am talking about the last two log lines).
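
For context, a minimal guest-side sketch (not part of the patch) of how
the new hypercall could be issued; it assumes an x86 guest kernel built
against headers that already carry the KVM_HC_HMROE definition added
below:

#include <linux/kvm_para.h>	/* kvm_hypercall1(), KVM_HC_HMROE */

/* Ask the host to write-protect one page-aligned guest virtual address. */
static long roe_protect_page(void *page_va)
{
	/* kvm_mroe() returns -EINVAL for addresses that are not page aligned. */
	return kvm_hypercall1(KVM_HC_HMROE, (unsigned long)page_va);
}

The argument is a guest virtual address; on the host side kvm_mroe()
translates it with kvm_mmu_gva_to_gpa_system() and then marks the frame
in the slot's mroe_bitmap via roe_protect_frame().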

I am aware that there is still a lot missing (like dealing with malicious
guest remappings) and the code quality sucks, but any ideas about what I
could be doing wrong (or ideas in general) would be appreciated. I am
already planning to do everything cleanly once it works.

Thanks.

Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@gmail.com>
---
 arch/x86/include/asm/kvm_host.h |   7 ++-
 arch/x86/kvm/Kconfig            |   7 +++
 arch/x86/kvm/mmu.c              | 127 +++++++++++++++++++++++++++-------------
 arch/x86/kvm/x86.c              |  83 ++++++++++++++++++++++++--
 include/linux/kvm_host.h        |  17 ++++++
 include/uapi/linux/kvm_para.h   |   4 +-
 virt/kvm/kvm_main.c             |  36 +++++++++---
 7 files changed, 226 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..c66e9245f750 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -235,7 +235,10 @@ struct kvm_mmu_memory_cache {
 	int nobjs;
 	void *objects[KVM_NR_MEM_OBJS];
 };
-
+struct kvm_write_access_data {
+	int i;
+	struct kvm_memory_slot *memslot;
+};
 /*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
@@ -1130,7 +1133,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 92fd433c50b9..8ae822a8dc7a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 auditing of KVM MMU events at runtime.
 
+config KVM_MROE
+	bool "Hypercall Memory Read-Only Enforcement"
+	depends on KVM && X86
+	help
+	This option adds the KVM_HC_HMROE hypercall to KVM as a hardening
+	mechanism to protect memory pages from being edited.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..946545b8b8cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,7 +70,7 @@ enum {
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
-static bool dbg = 0;
+static bool dbg = 1;
 module_param(dbg, bool, 0644);
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -1402,7 +1402,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
 	u64 spte = *sptep;
-
 	if (!is_writable_pte(spte) &&
 	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
 		return false;
@@ -1418,15 +1417,23 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
 
 static bool __rmap_write_protect(struct kvm *kvm,
 				 struct kvm_rmap_head *rmap_head,
-				 bool pt_protect)
+				 bool pt_protect,
+				 struct kvm_write_access_data *d)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
 	bool flush = false;
-
-	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_write_protect(sptep, pt_protect);
-
+	if (d != NULL) {
+		for_each_rmap_spte(rmap_head, &iter, sptep) {
+			flush |= spte_write_protect(sptep,
+				!test_bit(d->i, d->memslot->mroe_bitmap)
+				&& pt_protect);
+			d->i++;
+		}
+	} else {
+		for_each_rmap_spte(rmap_head, &iter, sptep)
+			flush |= spte_write_protect(sptep, pt_protect);
+	}
 	return flush;
 }
 
@@ -1457,7 +1464,8 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
  *	- W bit on ad-disabled SPTEs.
  * Returns true iff any D or W bits were cleared.
  */
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1483,7 +1491,8 @@ static bool spte_set_dirty(u64 *sptep)
 	return mmu_spte_update(sptep, spte);
 }
 
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1515,7 +1524,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 					  PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_write_protect(kvm, rmap_head, false);
+		__rmap_write_protect(kvm, rmap_head, false, NULL);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1541,7 +1550,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 					  PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_clear_dirty(kvm, rmap_head);
+		__rmap_clear_dirty(kvm, rmap_head, NULL);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1591,10 +1600,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	struct kvm_rmap_head *rmap_head;
 	int i;
 	bool write_protected = false;
-
+	struct kvm_write_access_data data = {
+		.i = 0,
+		.memslot = slot,
+	};
 	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmap_head = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+		write_protected |= __rmap_write_protect(kvm, rmap_head, true,
+				&data);
 	}
 
 	return write_protected;
@@ -1608,7 +1621,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1628,7 +1642,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
 			   unsigned long data)
 {
-	return kvm_zap_rmapp(kvm, rmap_head);
+	return kvm_zap_rmapp(kvm, rmap_head, NULL);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -5086,13 +5100,15 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+		struct kvm_rmap_head *rmap_head, void *data);
 
 /* The caller should hold mmu-lock before calling this function. */
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, int start_level, int end_level,
-			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb,
+			void *data)
 {
 	struct slot_rmap_walk_iterator iterator;
 	bool flush = false;
@@ -5100,7 +5116,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
 			end_gfn, &iterator) {
 		if (iterator.rmap)
-			flush |= fn(kvm, iterator.rmap);
+			flush |= fn(kvm, iterator.rmap, data);
 
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
@@ -5122,36 +5138,36 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		  slot_level_handler fn, int start_level, int end_level,
-		  bool lock_flush_tlb)
+		  bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level_range(kvm, memslot, fn, start_level,
 			end_level, memslot->base_gfn,
 			memslot->base_gfn + memslot->npages - 1,
-			lock_flush_tlb);
+			lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		      slot_level_handler fn, bool lock_flush_tlb)
+		      slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, bool lock_flush_tlb)
+			slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		 slot_level_handler fn, bool lock_flush_tlb)
+		 slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb, data);
 }
 
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5173,7 +5189,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
 						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
-						start, end - 1, true);
+						start, end - 1, true, NULL);
 		}
 	}
 
@@ -5181,23 +5197,52 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
-				    struct kvm_rmap_head *rmap_head)
+				    struct kvm_rmap_head *rmap_head,
+				    void *data)
 {
-	return __rmap_write_protect(kvm, rmap_head, false);
+	return __rmap_write_protect(kvm, rmap_head, false,
+			(struct kvm_write_access_data *)data);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+				      struct kvm_rmap_head *rmap_head,
+				      void *data)
+{
+	struct kvm_write_access_data *d = (struct kvm_write_access_data *) data;
+	unsigned long *protection = d->memslot->mroe_bitmap;
+	bool prot_mask = d->memslot->flags & KVM_MEM_READONLY;
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
+		flush |= spte_write_protect(sptep,
+				!(test_bit(d->i, protection) || prot_mask));
+		if (test_bit(d->i, protection)) {
+			pr_info("%s: The %dth page is readonly, Flush = %s\n",
+				 __func__, d->i, flush?"True" : "False");
+		}
+		d->i++;
+	}
+	return flush;
+}
+
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot)
 {
 	bool flush;
-
+	struct kvm_write_access_data data = {
+		.i = 0,
+		.memslot = memslot,
+	};
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
-				      false);
+	flush = slot_handle_all_level(kvm, memslot, slot_rmap_apply_protection,
+				      false, &data);
+	pr_info("%s: Flush = %s\n", __func__, flush ? "true":"false");
 	spin_unlock(&kvm->mmu_lock);
 
 	/*
-	 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+	 * kvm_mmu_slot_apply_write_access() and kvm_vm_ioctl_get_dirty_log()
 	 * which do tlb flush out of mmu-lock should be serialized by
 	 * kvm->slots_lock otherwise tlb flush would be missed.
 	 */
@@ -5219,7 +5264,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 }
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
-					 struct kvm_rmap_head *rmap_head)
+					 struct kvm_rmap_head *rmap_head,
+					 void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -5257,7 +5303,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
 	spin_lock(&kvm->mmu_lock);
 	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-			 kvm_mmu_zap_collapsible_spte, true);
+			 kvm_mmu_zap_collapsible_spte, true, NULL);
 	spin_unlock(&kvm->mmu_lock);
 }
 
@@ -5267,7 +5313,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false, NULL);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
@@ -5290,10 +5336,10 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 
 	spin_lock(&kvm->mmu_lock);
 	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
-					false);
+					false, NULL);
 	spin_unlock(&kvm->mmu_lock);
 
-	/* see kvm_mmu_slot_remove_write_access */
+	/* see kvm_mmu_slot_apply_write_access */
 	lockdep_assert_held(&kvm->slots_lock);
 
 	if (flush)
@@ -5307,7 +5353,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false,
+			NULL);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..96e967199fda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,7 +55,7 @@
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
 #include <linux/mem_encrypt.h>
-
+#include <linux/mempolicy.h>
 #include <trace/events/kvm.h>
 
 #include <asm/debugreg.h>
@@ -4177,7 +4177,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
 	/*
 	 * All the TLBs can be flushed out of mmu lock, see the comments in
-	 * kvm_mmu_slot_remove_write_access().
+	 * kvm_mmu_slot_apply_write_access().
 	 */
 	lockdep_assert_held(&kvm->slots_lock);
 	if (is_dirty)
@@ -6669,7 +6669,74 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
 	return ret;
 }
 #endif
+#ifdef CONFIG_KVM_MROE
+static int roe_protect_frame(struct kvm *kvm, gpa_t gpa)
+{
+	struct kvm_memory_slot *slot;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	//XXX do some error checking dude.
+	if (gfn > slot->base_gfn + slot->npages) {
+		//XXX use a better language
+		pr_err("You have an overflow\n");
+		return -1;
+	}
+	pr_info("Setting page number %lld in slot number %d\n",
+		gfn - slot->base_gfn, slot->id);
+	// something is wrong with the locking here
+	// you should lock the area before writing the bit
+	set_bit(gfn - slot->base_gfn, slot->mroe_bitmap);
+	kvm_mmu_slot_apply_write_access(kvm, slot);
+	return 0;
+}
+void debug_cpu_mode(struct kvm_vcpu *vcpu)
+{
+	char *mode = "Unknown";
+
+	if (vcpu->mode == OUTSIDE_GUEST_MODE)
+		mode = "OUTSIDE_GUEST_MODE";
+	else if (vcpu->mode == IN_GUEST_MODE)
+		mode = "IN_GUEST_MODE";
+	else if (vcpu->mode == EXITING_GUEST_MODE)
+		mode = "EXITING_GUEST_MODE";
+	else if (vcpu->mode == READING_SHADOW_PAGE_TABLES)
+		mode = "READING_SHADOW_PAGE_TABLES";
+	pr_info("kvm_mroe: cpu mode = %s\n", mode);
+}
+static int kvm_mroe(struct kvm_vcpu *vcpu, u64 gva)
+{
+	struct kvm *kvm = vcpu->kvm;
+	gpa_t gpa;
+	u64 hva;
+	int ret;
 
+	//XXX check that the hypercall is done from kernel mode
+	if (gva & ~PAGE_MASK)
+		return -EINVAL;
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+	//XXX This doesn't work but it will be ok to check that we can access
+	// the address and make sure that the mapping makes sense
+	if (!access_ok(VERIFY_WRITE, hva, PAGE_SIZE)) {
+		pr_info("Duplicate request\n");
+		return -KVM_EROEDUPLICATR;
+	}
+	pr_info("%s: flush state = %s\n", __func__,
+			kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+								     "Done");
+	debug_cpu_mode(vcpu);
+	ret = roe_protect_frame(vcpu->kvm, gpa);
+	debug_cpu_mode(vcpu);
+	kvm_vcpu_kick(vcpu);
+	debug_cpu_mode(vcpu);
+	pr_info("%s: flush state = %s\n", __func__,
+			kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+								     "Done");
+
+	return ret;
+}
+#endif
 /*
  * kvm_pv_kick_cpu_op:  Kick a vcpu.
  *
@@ -6737,6 +6804,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_CLOCK_PAIRING:
 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
+#endif
+#ifdef CONFIG_KVM_MROE
+	case KVM_HC_HMROE:
+		pr_info("Hypercall received, page address 0x%lx\n", a0);
+		ret = kvm_mroe(vcpu, a0);
+		break;
 #endif
 	default:
 		ret = -KVM_ENOSYS;
@@ -8971,8 +9044,10 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 				     struct kvm_memory_slot *new)
 {
 	/* Still write protect RO slot */
+	pr_info("%s: visited\n", __func__);
+	kvm_mmu_slot_apply_write_access(kvm, new);
+	return;
 	if (new->flags & KVM_MEM_READONLY) {
-		kvm_mmu_slot_remove_write_access(kvm, new);
 		return;
 	}
 
@@ -9010,7 +9085,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		if (kvm_x86_ops->slot_enable_log_dirty)
 			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 		else
-			kvm_mmu_slot_remove_write_access(kvm, new);
+			kvm_mmu_slot_apply_write_access(kvm, new);
 	} else {
 		if (kvm_x86_ops->slot_disable_log_dirty)
 			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc548a83..1ca6db7b8931 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/hashtable.h>
 #include <linux/hardirq.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -297,6 +298,9 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 struct kvm_memory_slot {
 	gfn_t base_gfn;
 	unsigned long npages;
+#ifdef CONFIG_KVM_MROE
+	unsigned long *mroe_bitmap;
+#endif
 	unsigned long *dirty_bitmap;
 	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
@@ -387,6 +391,13 @@ struct kvm_memslots {
 	int used_slots;
 };
 
+#ifdef CONFIG_KVM_MROE
+struct roe_page {
+	void *page_start;
+	struct hlist_node hash_list;
+};
+#endif
+
 struct kvm {
 	spinlock_t mmu_lock;
 	struct mutex slots_lock;
@@ -440,6 +451,12 @@ struct kvm {
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
 #endif
+
+#ifdef CONFIG_KVM_MROE
+	//TODO tune hash size;
+	#define KVM_MROE_HASH_SIZE 8
+	DECLARE_HASHTABLE(roe_pages, KVM_MROE_HASH_SIZE);
+#endif
 	long tlbs_dirty;
 	struct list_head devices;
 	struct dentry *debugfs_dentry;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..2be960477649 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -17,6 +17,8 @@
 #define KVM_EPERM		EPERM
 #define KVM_EOPNOTSUPP		95
 
+#define KVM_EROEDUPLICATR	1
+
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
 #define KVM_HC_FEATURES			3
@@ -26,7 +28,7 @@
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
-
+#define KVM_HC_HMROE			10
 /*
  * hypercalls use architecture specific
  */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..ca1b95a16a8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -634,7 +634,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->slots_lock);
 	refcount_set(&kvm->users_count, 1);
 	INIT_LIST_HEAD(&kvm->devices);
-
 	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_no_disable;
@@ -794,6 +793,17 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 	return 0;
 }
 
+static int kvm_init_mroe_bitmap(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_KVM_MROE
+	slot->mroe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!slot->mroe_bitmap)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
 /*
  * Insert memslot and re-sort memslots based on their GFN,
  * so binary search could be used to lookup GFN.
@@ -1011,7 +1021,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
 	}
-
+	if (kvm_init_mroe_bitmap(&new) < 0)
+		goto out_free;
 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!slots)
 		goto out_free;
@@ -1263,16 +1274,25 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_READONLY;
 }
-
+static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+#ifdef CONFIG_KVM_MROE
+	pr_info("%s: test_bit = %d", __func__,
+			test_bit(gfn - slot->base_gfn, slot->mroe_bitmap));
+	///dump_stack();
+	return test_bit(gfn - slot->base_gfn, slot->mroe_bitmap) ||
+		memslot_is_readonly(slot);
+#else
+	return memslot_is_readonly(slot);
+#endif
+}
 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return KVM_HVA_ERR_BAD;
-
-	if (memslot_is_readonly(slot) && write)
+	if (gfn_is_readonly(slot, gfn) && write)
 		return KVM_HVA_ERR_RO_BAD;
-
 	if (nr_pages)
 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
 
@@ -1314,7 +1334,7 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
 	if (!kvm_is_error_hva(hva) && writable)
-		*writable = !memslot_is_readonly(slot);
+		*writable = !gfn_is_readonly(slot, gfn);
 
 	return hva;
 }
@@ -1554,7 +1574,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 	}
 
 	/* Do not map writable pfn in the readonly memslot. */
-	if (writable && memslot_is_readonly(slot)) {
+	if (writable && gfn_is_readonly(slot, gfn)) {
 		*writable = false;
 		writable = NULL;
 	}
-- 
2.16.4



* [RFC] kvm: Adding skeleton for Memory ROE
  2018-07-16 12:12 [RFC] kvm: Adding skeleton for Memory ROE Ahmed Abd El Mawgood
@ 2018-07-16 19:35 ` Ahmed Abd El Mawgood
  2018-07-16 19:38   ` [RFC V2] " Ahmed Abd El Mawgood
  0 siblings, 1 reply; 3+ messages in thread
From: Ahmed Abd El Mawgood @ 2018-07-16 19:35 UTC (permalink / raw)
  To: Paolo Bonzini, Radim Krčmář,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Rik van Riel, x86,
	kvm, linux-kernel
  Cc: Ahmed Abd El Mawgood

This is my first patch, an attempt to implement Memory ROE, which I
discussed earlier as a way to prevent rootkits. I have already explained
it in detail in this thread:
https://www.mail-archive.com/kernelnewbies@kernelnewbies.org/msg18826.html
so there is no need to repeat the same thing here.
The problem is that the code isn't working and I can't figure out why.

I tried to implement the protection with behavior similar to that of
KVM_MEM_READONLY, but at the page (SPTE) level.
The current problem I am facing is that while handling the hypercall,
vcpu->mode turns out to be OUTSIDE_GUEST_MODE, but KVM_REQ_TLB_FLUSH
doesn't seem to be handled correctly. The KVM documentation promises that
when a VCPU is not in guest mode, requests are handled as soon as
possible, and that kvm_vcpu_kick(vcpu) will even force that, but it
doesn't seem to be the case for me. This is the kind of logging I am
getting:

[3556.312299] kvm_mmu_slot_apply_flags: visited
[3556.312301] kvm_mmu_slot_apply_write_access: Flush = false
[3557.034243] gfn_is_readonly: test_bit = 0
[3557.034251] gfn_is_readonly: test_bit = 0
[3557.034254] gfn_is_readonly: test_bit = 0
[3557.034463] Hypercall received, page address 0x0
[3557.034466] gfn_is_readonly: test_bit = 0
[3557.034469] kvm_mroe: flush state = Done
[3557.034472] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034475] Setting page number 0 in slot number 0
[3557.034480] slot_rmap_apply_protection: The 0th page is readonly, Flush = True
[3557.034483] kvm_mmu_slot_apply_write_access: Flush = true
[3557.034486] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034488] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[3557.034490] kvm_mroe: flush state = Waiting

For some reason kvm_vcpu_kick() didn't force KVM_REQ_TLB_FLUSH to be
handled on the virtual CPU (I am talking about the last two log lines).

I am aware that there is still a lot missing (like dealing with malicious
guest remappings) and the code quality sucks, but any ideas about what I
could be doing wrong (or ideas in general) would be appreciated. I am
already planning to do everything cleanly once it works.

Thanks.

Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@gmail.com>
---
 arch/x86/include/asm/kvm_host.h |   7 ++-
 arch/x86/kvm/Kconfig            |   7 +++
 arch/x86/kvm/mmu.c              | 127 +++++++++++++++++++++++++++-------------
 arch/x86/kvm/x86.c              |  83 ++++++++++++++++++++++++--
 include/linux/kvm_host.h        |  17 ++++++
 include/uapi/linux/kvm_para.h   |   4 +-
 virt/kvm/kvm_main.c             |  36 +++++++++---
 7 files changed, 226 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..c66e9245f750 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -235,7 +235,10 @@ struct kvm_mmu_memory_cache {
 	int nobjs;
 	void *objects[KVM_NR_MEM_OBJS];
 };
-
+struct kvm_write_access_data {
+	int i;
+	struct kvm_memory_slot *memslot;
+};
 /*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
@@ -1130,7 +1133,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 92fd433c50b9..8ae822a8dc7a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 auditing of KVM MMU events at runtime.
 
+config KVM_MROE
+	bool "Hypercall Memory Read-Only Enforcement"
+	depends on KVM && X86
+	help
+	This option adds the KVM_HC_HMROE hypercall to KVM as a hardening
+	mechanism to protect memory pages from being edited.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..946545b8b8cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,7 +70,7 @@ enum {
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
-static bool dbg = 0;
+static bool dbg = 1;
 module_param(dbg, bool, 0644);
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -1402,7 +1402,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
 	u64 spte = *sptep;
-
 	if (!is_writable_pte(spte) &&
 	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
 		return false;
@@ -1418,15 +1417,23 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
 
 static bool __rmap_write_protect(struct kvm *kvm,
 				 struct kvm_rmap_head *rmap_head,
-				 bool pt_protect)
+				 bool pt_protect,
+				 struct kvm_write_access_data *d)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
 	bool flush = false;
-
-	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_write_protect(sptep, pt_protect);
-
+	if (d != NULL) {
+		for_each_rmap_spte(rmap_head, &iter, sptep) {
+			flush |= spte_write_protect(sptep,
+				!test_bit(d->i, d->memslot->mroe_bitmap)
+				&& pt_protect);
+			d->i++;
+		}
+	} else {
+		for_each_rmap_spte(rmap_head, &iter, sptep)
+			flush |= spte_write_protect(sptep, pt_protect);
+	}
 	return flush;
 }
 
@@ -1457,7 +1464,8 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
  *	- W bit on ad-disabled SPTEs.
  * Returns true iff any D or W bits were cleared.
  */
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1483,7 +1491,8 @@ static bool spte_set_dirty(u64 *sptep)
 	return mmu_spte_update(sptep, spte);
 }
 
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1515,7 +1524,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 					  PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_write_protect(kvm, rmap_head, false);
+		__rmap_write_protect(kvm, rmap_head, false, NULL);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1541,7 +1550,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 					  PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_clear_dirty(kvm, rmap_head);
+		__rmap_clear_dirty(kvm, rmap_head, NULL);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1591,10 +1600,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	struct kvm_rmap_head *rmap_head;
 	int i;
 	bool write_protected = false;
-
+	struct kvm_write_access_data data = {
+		.i = 0,
+		.memslot = slot,
+	};
 	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmap_head = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+		write_protected |= __rmap_write_protect(kvm, rmap_head, true,
+				&data);
 	}
 
 	return write_protected;
@@ -1608,7 +1621,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1628,7 +1642,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
 			   unsigned long data)
 {
-	return kvm_zap_rmapp(kvm, rmap_head);
+	return kvm_zap_rmapp(kvm, rmap_head, NULL);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -5086,13 +5100,15 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+		struct kvm_rmap_head *rmap_head, void *data);
 
 /* The caller should hold mmu-lock before calling this function. */
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, int start_level, int end_level,
-			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb,
+			void *data)
 {
 	struct slot_rmap_walk_iterator iterator;
 	bool flush = false;
@@ -5100,7 +5116,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
 			end_gfn, &iterator) {
 		if (iterator.rmap)
-			flush |= fn(kvm, iterator.rmap);
+			flush |= fn(kvm, iterator.rmap, data);
 
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
@@ -5122,36 +5138,36 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		  slot_level_handler fn, int start_level, int end_level,
-		  bool lock_flush_tlb)
+		  bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level_range(kvm, memslot, fn, start_level,
 			end_level, memslot->base_gfn,
 			memslot->base_gfn + memslot->npages - 1,
-			lock_flush_tlb);
+			lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		      slot_level_handler fn, bool lock_flush_tlb)
+		      slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, bool lock_flush_tlb)
+			slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		 slot_level_handler fn, bool lock_flush_tlb)
+		 slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb, data);
 }
 
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5173,7 +5189,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
 						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
-						start, end - 1, true);
+						start, end - 1, true, NULL);
 		}
 	}
 
@@ -5181,23 +5197,52 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
-				    struct kvm_rmap_head *rmap_head)
+				    struct kvm_rmap_head *rmap_head,
+				    void *data)
 {
-	return __rmap_write_protect(kvm, rmap_head, false);
+	return __rmap_write_protect(kvm, rmap_head, false,
+			(struct kvm_write_access_data *)data);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+				      struct kvm_rmap_head *rmap_head,
+				      void *data)
+{
+	struct kvm_write_access_data *d = (struct kvm_write_access_data *) data;
+	unsigned long *protection = d->memslot->mroe_bitmap;
+	bool prot_mask = d->memslot->flags & KVM_MEM_READONLY;
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
+		flush |= spte_write_protect(sptep,
+				!(test_bit(d->i, protection) || prot_mask));
+		if (test_bit(d->i, protection)) {
+			pr_info("%s: The %dth page is readonly, Flush = %s\n",
+				 __func__, d->i, flush?"True" : "False");
+		}
+		d->i++;
+	}
+	return flush;
+}
+
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot)
 {
 	bool flush;
-
+	struct kvm_write_access_data data = {
+		.i = 0,
+		.memslot = memslot,
+	};
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
-				      false);
+	flush = slot_handle_all_level(kvm, memslot, slot_rmap_apply_protection,
+				      false, &data);
+	pr_info("%s: Flush = %s\n", __func__, flush ? "true":"false");
 	spin_unlock(&kvm->mmu_lock);
 
 	/*
-	 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+	 * kvm_mmu_slot_apply_write_access() and kvm_vm_ioctl_get_dirty_log()
 	 * which do tlb flush out of mmu-lock should be serialized by
 	 * kvm->slots_lock otherwise tlb flush would be missed.
 	 */
@@ -5219,7 +5264,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 }
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
-					 struct kvm_rmap_head *rmap_head)
+					 struct kvm_rmap_head *rmap_head,
+					 void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -5257,7 +5303,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
 	spin_lock(&kvm->mmu_lock);
 	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-			 kvm_mmu_zap_collapsible_spte, true);
+			 kvm_mmu_zap_collapsible_spte, true, NULL);
 	spin_unlock(&kvm->mmu_lock);
 }
 
@@ -5267,7 +5313,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false, NULL);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
@@ -5290,10 +5336,10 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 
 	spin_lock(&kvm->mmu_lock);
 	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
-					false);
+					false, NULL);
 	spin_unlock(&kvm->mmu_lock);
 
-	/* see kvm_mmu_slot_remove_write_access */
+	/* see kvm_mmu_slot_apply_write_access */
 	lockdep_assert_held(&kvm->slots_lock);
 
 	if (flush)
@@ -5307,7 +5353,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false,
+			NULL);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..96e967199fda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,7 +55,7 @@
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
 #include <linux/mem_encrypt.h>
-
+#include <linux/mempolicy.h>
 #include <trace/events/kvm.h>
 
 #include <asm/debugreg.h>
@@ -4177,7 +4177,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
 	/*
 	 * All the TLBs can be flushed out of mmu lock, see the comments in
-	 * kvm_mmu_slot_remove_write_access().
+	 * kvm_mmu_slot_apply_write_access().
 	 */
 	lockdep_assert_held(&kvm->slots_lock);
 	if (is_dirty)
@@ -6669,7 +6669,74 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
 	return ret;
 }
 #endif
+#ifdef CONFIG_KVM_MROE
+static int roe_protect_frame(struct kvm *kvm, gpa_t gpa)
+{
+	struct kvm_memory_slot *slot;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	//XXX do some error checking dude.
+	if (gfn > slot->base_gfn + slot->npages) {
+		//XXX use a better language
+		pr_err("You have an overflow\n");
+		return -1;
+	}
+	pr_info("Setting page number %lld in slot number %d\n",
+		gfn - slot->base_gfn, slot->id);
+	// something is wrong with the locking here
+	// you should lock the area before writing the bit
+	set_bit(gfn - slot->base_gfn, slot->mroe_bitmap);
+	kvm_mmu_slot_apply_write_access(kvm, slot);
+	return 0;
+}
+void debug_cpu_mode(struct kvm_vcpu *vcpu)
+{
+	char *mode = "Unknown";
+
+	if (vcpu->mode == OUTSIDE_GUEST_MODE)
+		mode = "OUTSIDE_GUEST_MODE";
+	else if (vcpu->mode == IN_GUEST_MODE)
+		mode = "IN_GUEST_MODE";
+	else if (vcpu->mode == EXITING_GUEST_MODE)
+		mode = "EXITING_GUEST_MODE";
+	else if (vcpu->mode == READING_SHADOW_PAGE_TABLES)
+		mode = "READING_SHADOW_PAGE_TABLES";
+	pr_info("kvm_mroe: cpu mode = %s\n", mode);
+}
+static int kvm_mroe(struct kvm_vcpu *vcpu, u64 gva)
+{
+	struct kvm *kvm = vcpu->kvm;
+	gpa_t gpa;
+	u64 hva;
+	int ret;
 
+	//XXX check that the hypercall is done from kernel mode
+	if (gva & ~PAGE_MASK)
+		return -EINVAL;
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+	//XXX This doesn't work but it will be ok to check that we can access
+	// the address and make sure that the mapping makes sense
+	if (!access_ok(VERIFY_WRITE, hva, PAGE_SIZE)) {
+		pr_info("Duplicate request\n");
+		return -KVM_EROEDUPLICATR;
+	}
+	pr_info("%s: flush state = %s\n", __func__,
+			kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+								     "Done");
+	debug_cpu_mode(vcpu);
+	ret = roe_protect_frame(vcpu->kvm, gpa);
+	debug_cpu_mode(vcpu);
+	kvm_vcpu_kick(vcpu);
+	debug_cpu_mode(vcpu);
+	pr_info("%s: flush state = %s\n", __func__,
+			kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+								     "Done");
+
+	return ret;
+}
+#endif
 /*
  * kvm_pv_kick_cpu_op:  Kick a vcpu.
  *
@@ -6737,6 +6804,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_CLOCK_PAIRING:
 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
+#endif
+#ifdef CONFIG_KVM_MROE
+	case KVM_HC_HMROE:
+		pr_info("Hypercall received, page address 0x%lx\n", a0);
+		ret = kvm_mroe(vcpu, a0);
+		break;
 #endif
 	default:
 		ret = -KVM_ENOSYS;
@@ -8971,8 +9044,10 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 				     struct kvm_memory_slot *new)
 {
 	/* Still write protect RO slot */
+	pr_info("%s: visited\n", __func__);
+	kvm_mmu_slot_apply_write_access(kvm, new);
+	return;
 	if (new->flags & KVM_MEM_READONLY) {
-		kvm_mmu_slot_remove_write_access(kvm, new);
 		return;
 	}
 
@@ -9010,7 +9085,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		if (kvm_x86_ops->slot_enable_log_dirty)
 			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 		else
-			kvm_mmu_slot_remove_write_access(kvm, new);
+			kvm_mmu_slot_apply_write_access(kvm, new);
 	} else {
 		if (kvm_x86_ops->slot_disable_log_dirty)
 			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc548a83..1ca6db7b8931 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/hashtable.h>
 #include <linux/hardirq.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -297,6 +298,9 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 struct kvm_memory_slot {
 	gfn_t base_gfn;
 	unsigned long npages;
+#ifdef CONFIG_KVM_MROE
+	unsigned long *mroe_bitmap;
+#endif
 	unsigned long *dirty_bitmap;
 	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
@@ -387,6 +391,13 @@ struct kvm_memslots {
 	int used_slots;
 };
 
+#ifdef CONFIG_KVM_MROE
+struct roe_page {
+	void *page_start;
+	struct hlist_node hash_list;
+};
+#endif
+
 struct kvm {
 	spinlock_t mmu_lock;
 	struct mutex slots_lock;
@@ -440,6 +451,12 @@ struct kvm {
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
 #endif
+
+#ifdef CONFIG_KVM_MROE
+	//TODO tune hash size;
+	#define KVM_MROE_HASH_SIZE 8
+	DECLARE_HASHTABLE(roe_pages, KVM_MROE_HASH_SIZE);
+#endif
 	long tlbs_dirty;
 	struct list_head devices;
 	struct dentry *debugfs_dentry;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..2be960477649 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -17,6 +17,8 @@
 #define KVM_EPERM		EPERM
 #define KVM_EOPNOTSUPP		95
 
+#define KVM_EROEDUPLICATR	1
+
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
 #define KVM_HC_FEATURES			3
@@ -26,7 +28,7 @@
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
-
+#define KVM_HC_HMROE			10
 /*
  * hypercalls use architecture specific
  */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..ca1b95a16a8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -634,7 +634,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->slots_lock);
 	refcount_set(&kvm->users_count, 1);
 	INIT_LIST_HEAD(&kvm->devices);
-
 	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_no_disable;
@@ -794,6 +793,17 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 	return 0;
 }
 
+static int kvm_init_mroe_bitmap(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_KVM_MROE
+	slot->mroe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!slot->mroe_bitmap)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
 /*
  * Insert memslot and re-sort memslots based on their GFN,
  * so binary search could be used to lookup GFN.
@@ -1011,7 +1021,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
 	}
-
+	if (kvm_init_mroe_bitmap(&new) < 0)
+		goto out_free;
 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!slots)
 		goto out_free;
@@ -1263,16 +1274,25 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_READONLY;
 }
-
+static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+#ifdef CONFIG_KVM_MROE
+	pr_info("%s: test_bit = %d", __func__,
+			test_bit(gfn - slot->base_gfn, slot->mroe_bitmap));
+	///dump_stack();
+	return test_bit(gfn - slot->base_gfn, slot->mroe_bitmap) ||
+		memslot_is_readonly(slot);
+#else
+	return memslot_is_readonly(slot);
+#endif
+}
 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return KVM_HVA_ERR_BAD;
-
-	if (memslot_is_readonly(slot) && write)
+	if (gfn_is_readonly(slot, gfn) && write)
 		return KVM_HVA_ERR_RO_BAD;
-
 	if (nr_pages)
 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
 
@@ -1314,7 +1334,7 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
 	if (!kvm_is_error_hva(hva) && writable)
-		*writable = !memslot_is_readonly(slot);
+		*writable = !gfn_is_readonly(slot, gfn);
 
 	return hva;
 }
@@ -1554,7 +1574,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 	}
 
 	/* Do not map writable pfn in the readonly memslot. */
-	if (writable && memslot_is_readonly(slot)) {
+	if (writable && gfn_is_readonly(slot, gfn)) {
 		*writable = false;
 		writable = NULL;
 	}
-- 
2.16.4



* [RFC V2] kvm: Adding skeleton for Memory ROE
  2018-07-16 19:35 ` Ahmed Abd El Mawgood
@ 2018-07-16 19:38   ` Ahmed Abd El Mawgood
  0 siblings, 0 replies; 3+ messages in thread
From: Ahmed Abd El Mawgood @ 2018-07-16 19:38 UTC (permalink / raw)
  To: Paolo Bonzini, Radim Krčmář,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Rik van Riel, x86,
	kvm, linux-kernel
  Cc: Ahmed Abd El Mawgood

This is an attempt to implement Memory ROE, which I discussed earlier as
a way to prevent rootkits. I have already explained it in detail in this
thread:
https://www.mail-archive.com/kernelnewbies@kernelnewbies.org/msg18826.html
so there is no need to repeat the same thing here.
The problem is that the code isn't working and I can't figure out why.

I tried to implement the protection with behavior similar to that of
KVM_MEM_READONLY, but at the page (SPTE) level.
The current problem I am facing is that while handling the hypercall,
vcpu->mode turns out to be OUTSIDE_GUEST_MODE, but KVM_REQ_TLB_FLUSH
doesn't seem to be handled correctly. The KVM documentation promises that
when a VCPU is not in guest mode, requests are handled as soon as
possible, and that kvm_vcpu_kick(vcpu) will even force that, but it
doesn't seem to be the case for me. This is the kind of logging I am
getting:

[9073.753306] I came here kvm_mmu_slot_apply_flags
[9073.753311] kvm_mmu_slot_apply_write_access: Flush = false
[9073.992536] gfn_is_readonly: test_bit = 0
[9073.992543] gfn_is_readonly: test_bit = 0
[9073.992545] gfn_is_readonly: test_bit = 0
[9073.992703] Hypercall received, page address 0x0
[9073.992705] gfn_is_readonly: test_bit = 0
[9073.992708] kvm_mroe: flush state = Done
[9073.992709] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[9073.992711] Setting page number 0 in slot number 0
[9073.992715] slot_rmap_apply_protection: The 0th page is readonly, Flush = True
[9073.992717] kvm_mmu_slot_apply_write_access: Flush = true
[9073.992719] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[9073.992720] kvm_mroe: cpu mode = OUTSIDE_GUEST_MODE
[9073.992721] kvm_mroe: flush state = Waiting

For some reason kvm_vcpu_kick() didn't force KVM_REQ_TLB_FLUSH to be
handled on the virtual CPU (I am talking about the last two log lines).

I am aware that there is still a lot missing (like dealing with malicious
guest remappings) and the code quality sucks, but any ideas about what I
could be doing wrong (or ideas in general) would be appreciated. I am
already planning to do everything cleanly once it works.

Thanks.

Edits for V2:
Unfortunately I made a few mistakes that led to the kernel not compiling
with a vanilla .config file: because of missing #ifdef..#endif guards in
some places, symbols that are only available when CONFIG_KVM_MROE is set
were used even when it wasn't. That is fixed now. I should also note that
CONFIG_KVM_MROE has to be enabled when testing my code and trying to
figure out what went wrong.
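
A minimal sketch of the host kernel configuration assumed for testing
(CONFIG_KVM_INTEL would be CONFIG_KVM_AMD on an AMD host):

CONFIG_VIRTUALIZATION=y
CONFIG_KVM=y
CONFIG_KVM_INTEL=y
CONFIG_KVM_MROE=y
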
Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@gmail.com>
---
 arch/x86/include/asm/kvm_host.h |   7 +-
 arch/x86/kvm/Kconfig            |   7 ++
 arch/x86/kvm/mmu.c              | 158 ++++++++++++++++++++++++++++++----------
 arch/x86/kvm/x86.c              |  83 ++++++++++++++++++++-
 include/linux/kvm_host.h        |  17 +++++
 include/uapi/linux/kvm_para.h   |   4 +-
 virt/kvm/kvm_main.c             |  36 +++++++--
 7 files changed, 257 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..c66e9245f750 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -235,7 +235,10 @@ struct kvm_mmu_memory_cache {
 	int nobjs;
 	void *objects[KVM_NR_MEM_OBJS];
 };
-
+struct kvm_write_access_data {
+	int i;
+	struct kvm_memory_slot *memslot;
+};
 /*
  * the pages used as guest page table on soft mmu are tracked by
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
@@ -1130,7 +1133,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 92fd433c50b9..8ae822a8dc7a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 auditing of KVM MMU events at runtime.
 
+config KVM_MROE
+	bool "Hypercall Memory Read-Only Enforcement"
+	depends on KVM && X86
+	help
+	This option adds the KVM_HC_HMROE hypercall to KVM as a hardening
+	mechanism to protect memory pages from being edited.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..e06e923f90aa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,7 +70,7 @@ enum {
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
-static bool dbg = 0;
+static bool dbg = 1;
 module_param(dbg, bool, 0644);
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -1402,7 +1402,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
 	u64 spte = *sptep;
-
 	if (!is_writable_pte(spte) &&
 	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
 		return false;
@@ -1415,21 +1414,42 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
 
 	return mmu_spte_update(sptep, spte);
 }
-
+#ifdef CONFIG_KVM_MROE
 static bool __rmap_write_protect(struct kvm *kvm,
 				 struct kvm_rmap_head *rmap_head,
-				 bool pt_protect)
+				 bool pt_protect,
+				 struct kvm_write_access_data *d)
+{
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+	if (d != NULL) {
+		for_each_rmap_spte(rmap_head, &iter, sptep) {
+			flush |= spte_write_protect(sptep,
+				!test_bit(d->i, d->memslot->mroe_bitmap)
+				&& pt_protect);
+			d->i++;
+		}
+	} else {
+		for_each_rmap_spte(rmap_head, &iter, sptep)
+			flush |= spte_write_protect(sptep, pt_protect);
+	}
+	return flush;
+}
+#else
+static bool __rmap_write_protect(struct kvm *kvm,
+				 struct kvm_rmap_head *rmap_head,
+				 bool pt_protect,
+				 struct kvm_write_access_data *d)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
 	bool flush = false;
-
 	for_each_rmap_spte(rmap_head, &iter, sptep)
 		flush |= spte_write_protect(sptep, pt_protect);
-
 	return flush;
 }
-
+#endif
 static bool spte_clear_dirty(u64 *sptep)
 {
 	u64 spte = *sptep;
@@ -1457,7 +1477,8 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
  *	- W bit on ad-disabled SPTEs.
  * Returns true iff any D or W bits were cleared.
  */
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1483,7 +1504,8 @@ static bool spte_set_dirty(u64 *sptep)
 	return mmu_spte_update(sptep, spte);
 }
 
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1515,7 +1537,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 					  PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_write_protect(kvm, rmap_head, false);
+		__rmap_write_protect(kvm, rmap_head, false, NULL);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1541,7 +1563,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
 					  PT_PAGE_TABLE_LEVEL, slot);
-		__rmap_clear_dirty(kvm, rmap_head);
+		__rmap_clear_dirty(kvm, rmap_head, NULL);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
@@ -1591,10 +1613,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	struct kvm_rmap_head *rmap_head;
 	int i;
 	bool write_protected = false;
-
+	struct kvm_write_access_data data = {
+		.i = 0,
+		.memslot = slot,
+	};
 	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmap_head = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+		write_protected |= __rmap_write_protect(kvm, rmap_head, true,
+				&data);
 	}
 
 	return write_protected;
@@ -1608,7 +1634,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
 }
 
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+		void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1628,7 +1655,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
 			   unsigned long data)
 {
-	return kvm_zap_rmapp(kvm, rmap_head);
+	return kvm_zap_rmapp(kvm, rmap_head, NULL);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -5086,13 +5113,15 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+		struct kvm_rmap_head *rmap_head, void *data);
 
 /* The caller should hold mmu-lock before calling this function. */
 static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, int start_level, int end_level,
-			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb,
+			void *data)
 {
 	struct slot_rmap_walk_iterator iterator;
 	bool flush = false;
@@ -5100,7 +5129,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
 			end_gfn, &iterator) {
 		if (iterator.rmap)
-			flush |= fn(kvm, iterator.rmap);
+			flush |= fn(kvm, iterator.rmap, data);
 
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
@@ -5122,36 +5151,36 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		  slot_level_handler fn, int start_level, int end_level,
-		  bool lock_flush_tlb)
+		  bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level_range(kvm, memslot, fn, start_level,
 			end_level, memslot->base_gfn,
 			memslot->base_gfn + memslot->npages - 1,
-			lock_flush_tlb);
+			lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		      slot_level_handler fn, bool lock_flush_tlb)
+		      slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, bool lock_flush_tlb)
+			slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
 }
 
 static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		 slot_level_handler fn, bool lock_flush_tlb)
+		 slot_level_handler fn, bool lock_flush_tlb, void *data)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb, data);
 }
 
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5173,7 +5202,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
 						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
-						start, end - 1, true);
+						start, end - 1, true, NULL);
 		}
 	}
 
@@ -5181,23 +5210,70 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
-				    struct kvm_rmap_head *rmap_head)
+				    struct kvm_rmap_head *rmap_head,
+				    void *data)
+{
+	return __rmap_write_protect(kvm, rmap_head, false,
+			(struct kvm_write_access_data *)data);
+}
+#ifdef CONFIG_KVM_MROE
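+/*
+ * Walk the SPTEs under @rmap_head and call spte_write_protect() on each
+ * of them, deriving the pt_protect argument from the slot's mroe_bitmap
+ * (indexed by the running counter d->i) and the KVM_MEM_READONLY flag.
+ */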
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+				      struct kvm_rmap_head *rmap_head,
+				      void *data)
 {
-	return __rmap_write_protect(kvm, rmap_head, false);
+	struct kvm_write_access_data *d = (struct kvm_write_access_data *) data;
+	unsigned long *protection = d->memslot->mroe_bitmap;
+	bool prot_mask = d->memslot->flags & KVM_MEM_READONLY;
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
+		flush |= spte_write_protect(sptep,
+				!(test_bit(d->i, protection) || prot_mask));
+		if (test_bit(d->i, protection)) {
+			pr_info("%s: The %dth page is readonly, Flush = %s\n",
+				 __func__, d->i, flush?"True" : "False");
+		}
+		d->i++;
+	}
+	return flush;
+}
+#else
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+				      struct kvm_rmap_head *rmap_head,
+				      void *data)
+{
+	struct kvm_write_access_data *d = (struct kvm_write_access_data *) data;
+	bool prot_mask = d->memslot->flags & KVM_MEM_READONLY;
+	u64 *sptep;
+	struct rmap_iterator iter;
+	bool flush = false;
+
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
+		flush |= spte_write_protect(sptep, !prot_mask);
+		d->i++;
+	}
+	return flush;
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+#endif
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot)
 {
 	bool flush;
-
+	struct kvm_write_access_data data = {
+		.i = 0,
+		.memslot = memslot,
+	};
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
-				      false);
+	flush = slot_handle_all_level(kvm, memslot, slot_rmap_apply_protection,
+				      false, &data);
+	pr_info("%s: Flush = %s\n", __func__, flush ? "true":"false");
 	spin_unlock(&kvm->mmu_lock);
 
 	/*
-	 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+	 * kvm_mmu_slot_apply_write_access() and kvm_vm_ioctl_get_dirty_log()
 	 * which do tlb flush out of mmu-lock should be serialized by
 	 * kvm->slots_lock otherwise tlb flush would be missed.
 	 */
@@ -5219,7 +5295,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 }
 
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
-					 struct kvm_rmap_head *rmap_head)
+					 struct kvm_rmap_head *rmap_head,
+					 void *data)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -5257,7 +5334,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
 	spin_lock(&kvm->mmu_lock);
 	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
-			 kvm_mmu_zap_collapsible_spte, true);
+			 kvm_mmu_zap_collapsible_spte, true, NULL);
 	spin_unlock(&kvm->mmu_lock);
 }
 
@@ -5267,7 +5344,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false, NULL);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
@@ -5290,10 +5367,10 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 
 	spin_lock(&kvm->mmu_lock);
 	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
-					false);
+					false, NULL);
 	spin_unlock(&kvm->mmu_lock);
 
-	/* see kvm_mmu_slot_remove_write_access */
+	/* see kvm_mmu_slot_apply_write_access */
 	lockdep_assert_held(&kvm->slots_lock);
 
 	if (flush)
@@ -5307,7 +5384,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false,
+			NULL);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0046aa70205a..96e967199fda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,7 +55,7 @@
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
 #include <linux/mem_encrypt.h>
-
+#include <linux/mempolicy.h>
 #include <trace/events/kvm.h>
 
 #include <asm/debugreg.h>
@@ -4177,7 +4177,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
 	/*
 	 * All the TLBs can be flushed out of mmu lock, see the comments in
-	 * kvm_mmu_slot_remove_write_access().
+	 * kvm_mmu_slot_apply_write_access().
 	 */
 	lockdep_assert_held(&kvm->slots_lock);
 	if (is_dirty)
@@ -6669,7 +6669,74 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
 	return ret;
 }
 #endif
+#ifdef CONFIG_KVM_MROE
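+/*
+ * Mark the frame containing @gpa as read-only in its memslot's
+ * mroe_bitmap and re-apply write protection on that slot.
+ */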
+static int roe_protect_frame(struct kvm *kvm, gpa_t gpa)
+{
+	struct kvm_memory_slot *slot;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	//XXX: add proper error checking (gfn_to_memslot() may return NULL)
+	if (gfn >= slot->base_gfn + slot->npages) {
+		pr_err("roe_protect_frame: gfn is outside the memslot\n");
+		return -1;
+	}
+	pr_info("Setting page number %lld in slot number %d\n",
+		gfn - slot->base_gfn, slot->id);
+	// XXX: the locking here is suspect; the bitmap bit should be set
+	// under an appropriate lock before re-applying write protection
+	set_bit(gfn - slot->base_gfn, slot->mroe_bitmap);
+	kvm_mmu_slot_apply_write_access(kvm, slot);
+	return 0;
+}
+static void debug_cpu_mode(struct kvm_vcpu *vcpu)
+{
+	char *mode = "Unknown";
+
+	if (vcpu->mode == OUTSIDE_GUEST_MODE)
+		mode = "OUTSIDE_GUEST_MODE";
+	else if (vcpu->mode == IN_GUEST_MODE)
+		mode = "IN_GUEST_MODE";
+	else if (vcpu->mode == EXITING_GUEST_MODE)
+		mode = "EXITING_GUEST_MODE";
+	else if (vcpu->mode == READING_SHADOW_PAGE_TABLES)
+		mode = "READING_SHADOW_PAGE_TABLES";
+	pr_info("kvm_mroe: cpu mode = %s\n", mode);
+}
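+/*
+ * KVM_HC_HMROE handler: translate the guest virtual address to a GPA and
+ * protect the containing frame; the kvm_vcpu_kick() is intended to force
+ * the resulting KVM_REQ_TLB_FLUSH to be serviced.
+ */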
+static int kvm_mroe(struct kvm_vcpu *vcpu, u64 gva)
+{
+	struct kvm *kvm = vcpu->kvm;
+	gpa_t gpa;
+	u64 hva;
+	int ret;
 
+	//XXX check that the hypercall is done from kernel mode
+	if (gva & ~PAGE_MASK)
+		return -EINVAL;
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+	//XXX This check does not work yet; the intent is to verify that the
+	// address is accessible and that the mapping makes sense
+	if (!access_ok(VERIFY_WRITE, hva, PAGE_SIZE)) {
+		pr_info("Duplicate request\n");
+		return -KVM_EROEDUPLICATE;
+	}
+	/* kvm_test_request() only peeks; kvm_check_request() would clear it */
+	pr_info("%s: flush state = %s\n", __func__,
+			kvm_test_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+								    "Done");
+	debug_cpu_mode(vcpu);
+	ret = roe_protect_frame(vcpu->kvm, gpa);
+	debug_cpu_mode(vcpu);
+	kvm_vcpu_kick(vcpu);
+	debug_cpu_mode(vcpu);
+	pr_info("%s: flush state = %s\n", __func__,
+			kvm_test_request(KVM_REQ_TLB_FLUSH, vcpu) ? "Waiting" :
+								    "Done");
+
+	return ret;
+}
+#endif
 /*
  * kvm_pv_kick_cpu_op:  Kick a vcpu.
  *
@@ -6737,6 +6804,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_CLOCK_PAIRING:
 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
+#endif
+#ifdef CONFIG_KVM_MROE
+	case KVM_HC_HMROE:
+		pr_info("Hypercall received, page address 0x%lx\n", a0);
+		ret = kvm_mroe(vcpu, a0);
+		break;
 #endif
 	default:
 		ret = -KVM_ENOSYS;
@@ -8971,8 +9044,10 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 				     struct kvm_memory_slot *new)
 {
 	/* Still write protect RO slot */
+	pr_info("%s: visited\n", __func__);
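+	/* debugging: always re-apply protection; the code below is skipped */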
+	kvm_mmu_slot_apply_write_access(kvm, new);
+	return;
 	if (new->flags & KVM_MEM_READONLY) {
-		kvm_mmu_slot_remove_write_access(kvm, new);
 		return;
 	}
 
@@ -9010,7 +9085,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		if (kvm_x86_ops->slot_enable_log_dirty)
 			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 		else
-			kvm_mmu_slot_remove_write_access(kvm, new);
+			kvm_mmu_slot_apply_write_access(kvm, new);
 	} else {
 		if (kvm_x86_ops->slot_disable_log_dirty)
 			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc548a83..1ca6db7b8931 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/hashtable.h>
 #include <linux/hardirq.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -297,6 +298,9 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 struct kvm_memory_slot {
 	gfn_t base_gfn;
 	unsigned long npages;
+#ifdef CONFIG_KVM_MROE
+	unsigned long *mroe_bitmap;
+#endif
 	unsigned long *dirty_bitmap;
 	struct kvm_arch_memory_slot arch;
 	unsigned long userspace_addr;
@@ -387,6 +391,13 @@ struct kvm_memslots {
 	int used_slots;
 };
 
+#ifdef CONFIG_KVM_MROE
+struct roe_page {
+	void *page_start;
+	struct hlist_node hash_list;
+};
+#endif
+
 struct kvm {
 	spinlock_t mmu_lock;
 	struct mutex slots_lock;
@@ -440,6 +451,12 @@ struct kvm {
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
 #endif
+
+#ifdef CONFIG_KVM_MROE
+	//TODO tune hash size;
+	#define KVM_MROE_HASH_SIZE 8
+	DECLARE_HASHTABLE(roe_pages, KVM_MROE_HASH_SIZE);
+#endif
 	long tlbs_dirty;
 	struct list_head devices;
 	struct dentry *debugfs_dentry;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..2be960477649 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -17,6 +17,8 @@
 #define KVM_EPERM		EPERM
 #define KVM_EOPNOTSUPP		95
 
+#define KVM_EROEDUPLICATE	1
+
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
 #define KVM_HC_FEATURES			3
@@ -26,7 +28,7 @@
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
-
+#define KVM_HC_HMROE			10
 /*
  * hypercalls use architecture specific
  */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..ca1b95a16a8b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -634,7 +634,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->slots_lock);
 	refcount_set(&kvm->users_count, 1);
 	INIT_LIST_HEAD(&kvm->devices);
-
 	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_no_disable;
@@ -794,6 +793,17 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 	return 0;
 }
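+/*
+ * Allocate one bit per page in the memslot to track which frames have
+ * been made read-only via the ROE hypercall.
+ */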
 
+static int kvm_init_mroe_bitmap(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_KVM_MROE
+	slot->mroe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!slot->mroe_bitmap)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
 /*
  * Insert memslot and re-sort memslots based on their GFN,
  * so binary search could be used to lookup GFN.
@@ -1011,7 +1021,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
 	}
-
+	if (kvm_init_mroe_bitmap(&new) < 0)
+		goto out_free;
 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 	if (!slots)
 		goto out_free;
@@ -1263,16 +1274,25 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_READONLY;
 }
-
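+/*
+ * With CONFIG_KVM_MROE a gfn is read-only if its bit is set in the
+ * slot's mroe_bitmap or the whole slot is KVM_MEM_READONLY; otherwise
+ * this reduces to memslot_is_readonly().
+ */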
+static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+#ifdef CONFIG_KVM_MROE
+	pr_info("%s: test_bit = %d\n", __func__,
+			test_bit(gfn - slot->base_gfn, slot->mroe_bitmap));
+	///dump_stack();
+	return test_bit(gfn - slot->base_gfn, slot->mroe_bitmap) ||
+		memslot_is_readonly(slot);
+#else
+	return memslot_is_readonly(slot);
+#endif
+}
 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return KVM_HVA_ERR_BAD;
-
-	if (memslot_is_readonly(slot) && write)
+	if (gfn_is_readonly(slot, gfn) && write)
 		return KVM_HVA_ERR_RO_BAD;
-
 	if (nr_pages)
 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
 
@@ -1314,7 +1334,7 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
 
 	if (!kvm_is_error_hva(hva) && writable)
-		*writable = !memslot_is_readonly(slot);
+		*writable = !gfn_is_readonly(slot, gfn);
 
 	return hva;
 }
@@ -1554,7 +1574,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 	}
 
 	/* Do not map writable pfn in the readonly memslot. */
-	if (writable && memslot_is_readonly(slot)) {
+	if (writable && gfn_is_readonly(slot, gfn)) {
 		*writable = false;
 		writable = NULL;
 	}
-- 
2.16.4

