##############################################################################
# kvm-mwait-emu.patch (Gabriel Somlo <somlo@cmu.edu>)
# GLS: emulate MONITOR and MWAIT at page-level granularity by write-protecting
#       the page containing a monitored location and appropriately handling
#       subsequent write faults.
# NOTE: So far, this only works somewhat reliably on single-VCPU non-SMP
#       guests, so it may, for all I know, do nasty things to your computer
#       and/or other worldly possessions, loved ones, pets, etc. :)
#                       You Have Been Warned !
#       Also, after debugging the SMP issue, we'll need a way to trigger a
#       periodic cleanup that will switch write-protected monitored pages
#       back to read-write, once they've stayed unused for "long enough"
##############################################################################
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fdf83af..7ca9b51 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -337,6 +337,16 @@ struct kvm_pmu {
 	u64 reprogram_pmi;
 };
 
+/*
+ * One MONITORed guest page; linked on kvm_arch.mwait_pg_list via @link.
+ */
+struct kvm_mwait_pg {
+	gpa_t gpa;                  /* guest-physical address being monitored */
+	struct list_head vcpu_list; /* VCPUs monitoring (armed on) this page */
+	struct list_head link;      /* links mwait-pages within a KVM */
+	unsigned accessed;          /* set to 1 on use; never read so far */
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -528,6 +538,10 @@ struct kvm_vcpu_arch {
 	struct {
 		bool pv_unhalted;
 	} pv;
+
+	/* MONITOR/MWAIT support */
+	struct kvm_mwait_pg *mwp;	/* page monitored by this VCPU */
+	struct list_head mw_link;	/* all VCPUs monitoring the same page */
 };
 
 struct kvm_lpage_info {
@@ -607,6 +621,10 @@ struct kvm_arch {
 	u64 hv_hypercall;
 	u64 hv_tsc_page;
 
+	/* MONITOR/MWAIT support */
+	struct mutex mwait_lock;
+	struct list_head mwait_pg_list;	/* monitored pages within this KVM */
+
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
@@ -854,6 +872,8 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -915,6 +935,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+int kvm_mmu_protect_page(struct kvm *kvm, gfn_t gfn);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c697625..7d4f1ca 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -279,6 +279,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
+		/* OS X does not check CPUID before using MONITOR/MWAIT from its
+		 * power-optimized idle loop (AppleIntelPowerManagement.kext).
+		 * For now, MONITOR stays unadvertised below; a misbehaving guest
+		 * that executes MONITOR/MWAIT anyway gets them emulated instead
+		 * of receiving an invalid-opcode fault. As an optional guest-side
+		 * optimization, removing that kext from OS X makes it fall back
+		 * to a HLT-based idle loop.
+		 */
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e50425d..bc02ebd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2283,6 +2283,28 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
+/*
+ * kvm_mmu_protect_page - write-protect guest frame @gfn.
+ *
+ * Runs rmap_write_protect() under kvm->mmu_lock and, if that reports a
+ * non-zero result, flushes remote TLBs before dropping the lock.
+ *
+ * Returns the rmap_write_protect() result unchanged.
+ */
+int kvm_mmu_protect_page(struct kvm *kvm, gfn_t gfn)
+{
+	int protected;
+
+	spin_lock(&kvm->mmu_lock);
+	protected = rmap_write_protect(kvm, gfn);
+	if (protected != 0)
+		kvm_flush_remote_tlbs(kvm);
+	spin_unlock(&kvm->mmu_lock);
+
+	return protected;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_protect_page);
+
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -4146,12 +4160,94 @@ static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
 	return vcpu_match_mmio_gva(vcpu, addr);
 }
 
+/*
+ * Try to handle a fault caused by a write to a monitored (mwait) page.
+ *
+ * @vcpu:   VCPU that took the fault
+ * @gva:    faulting address; with a direct-mapped MMU (TDP) this is
+ *          already a guest-physical address and is not translated again
+ * @in:     instruction bytes, forwarded to x86_emulate_instruction()
+ * @in_len: length of @in, forwarded likewise
+ *
+ * When the address matches an entry on the VM's mwait_pg_list and the
+ * faulting instruction is emulated successfully, every VCPU armed on that
+ * entry is disarmed; those parked in KVM_MP_STATE_MWAIT are made runnable
+ * and kicked so that they actually leave their blocked state.
+ *
+ * Returns true if the fault was consumed here, false if the caller must
+ * fall through to the regular page-fault path.
+ *
+ * FIXME: aim for better integration between this and FNAME(page_fault)() and
+ * kvm_mmu_page_fault() below. For now, this is proof-of-concept code.
+ */
+static bool handle_mwait_write_fault(struct kvm_vcpu *vcpu, gva_t gva,
+					void *in, int in_len)
+{
+	gpa_t gpa = gva;
+	struct kvm_mwait_pg *p, *mwp = NULL;
+	struct kvm_vcpu_arch *v, *u;
+	bool r = false;
+
+	/* only a non-direct (shadow) MMU passes a GVA needing translation */
+	if (!vcpu->arch.mmu.direct_map) {
+		gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+		if (gpa == UNMAPPED_GVA)
+			goto ul_out;
+	}
+
+	mutex_lock(&vcpu->kvm->arch.mwait_lock);
+
+	/* is gpa matching a monitored (mwait) page? */
+	list_for_each_entry(p, &vcpu->kvm->arch.mwait_pg_list, link)
+		if (p->gpa == gpa) {
+			mwp = p;
+			break;
+		}
+	if (mwp == NULL)
+		goto out;
+
+	mwp->accessed = 1;
+
+	if (x86_emulate_instruction(vcpu, gva,
+				    EMULTYPE_RETRY, in, in_len) != EMULATE_DONE)
+		goto out;
+
+	/* disarm all VCPUs monitoring this page, waking them if needed */
+	list_for_each_entry_safe(v, u, &mwp->vcpu_list, mw_link) {
+		list_del(&v->mw_link);
+		v->mwp = NULL;
+		if (v->mp_state == KVM_MP_STATE_MWAIT) {
+			v->mp_state = KVM_MP_STATE_RUNNABLE;
+			/* blocked VCPUs re-check state only when woken */
+			kvm_vcpu_kick(container_of(v, struct kvm_vcpu, arch));
+		}
+	}
+
+	/*
+	 * What if the mwait is woken up by an interrupt instead of a write ?
+	 * It might remain "armed" on its old mwait page, but any subsequent
+	 * MONITOR instruction would replace that, so I don't think we need
+	 * to worry about it...
+	 */
+
+	r = true;
+out:
+	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
+ul_out:
+	return r;
+}
+
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
 
+	/* writing to MONITORed memory area ? */
+	if ((error_code & PFERR_WRITE_MASK) &&
+	    handle_mwait_write_fault(vcpu, cr2, insn, insn_len))
+		return 1;
+
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
 	if (r < 0)
 		goto out;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e81df8f..638704c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3262,6 +3262,30 @@ static int pause_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+/*
+ * MONITOR intercept: skip over the instruction, then hand the VCPU to
+ * kvm_emulate_monitor() and propagate its return value.
+ */
+static int monitor_interception(struct vcpu_svm *svm)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+
+	skip_emulated_instruction(vcpu);
+	return kvm_emulate_monitor(vcpu);
+}
+
+/*
+ * MWAIT intercept: skip over the instruction, then hand the VCPU to
+ * kvm_emulate_mwait() and propagate its return value.
+ */
+static int mwait_interception(struct vcpu_svm *svm)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+
+	skip_emulated_instruction(vcpu);
+	return kvm_emulate_mwait(vcpu);
+}
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR0]			= cr_interception,
 	[SVM_EXIT_READ_CR3]			= cr_interception,
@@ -3319,8 +3331,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_CLGI]				= clgi_interception,
 	[SVM_EXIT_SKINIT]			= skinit_interception,
 	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
-	[SVM_EXIT_MONITOR]			= invalid_op_interception,
-	[SVM_EXIT_MWAIT]			= invalid_op_interception,
+	[SVM_EXIT_MONITOR]			= monitor_interception,
+	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
 };
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a06f101..a7382e1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5603,6 +5603,32 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * MONITOR exit: advance past the instruction, then defer to
+ * kvm_emulate_monitor(); its result becomes the handler's return value.
+ */
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	skip_emulated_instruction(vcpu);
+	ret = kvm_emulate_monitor(vcpu);
+	return ret;
+}
+
+/*
+ * MWAIT exit: advance past the instruction, then defer to
+ * kvm_emulate_mwait(); its result becomes the handler's return value.
+ */
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	skip_emulated_instruction(vcpu);
+	ret = kvm_emulate_mwait(vcpu);
+	return ret;
+}
+
 /*
  * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
  * We could reuse a single VMCS for all the L2 guests, but we also want the
@@ -6483,8 +6495,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
-	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
+	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
 	[EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 39c28f09..8edc1be 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5592,6 +5592,97 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+/*
+ * kvm_emulate_monitor - emulate the MONITOR instruction for @vcpu.
+ *
+ * Drops whatever monitor the VCPU had armed, translates the address in RAX
+ * to a guest-physical address and arms the VCPU on the kvm_mwait_pg with
+ * that gpa, write-protecting the page and allocating a new entry if none
+ * exists yet. Without an in-kernel irqchip, or when translation,
+ * write-protection or allocation fails, the VCPU is left unarmed and a
+ * subsequent MWAIT is emulated as a NOP.
+ *
+ * Always returns 1.
+ *
+ * NOTE(review): entries are matched on the exact gpa, not on the page
+ * frame, and nothing visible here ever frees them.
+ */
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
+{
+	gva_t gva;
+	gpa_t gpa;
+	struct kvm_mwait_pg *p;
+
+	/* emulate as NOP if no-kvm-irqchip */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return 1;
+
+	mutex_lock(&vcpu->kvm->arch.mwait_lock);
+
+	/* relinquish any previously monitored mwait page */
+	if (vcpu->arch.mwp != NULL) {
+		list_del(&vcpu->arch.mw_link);
+		vcpu->arch.mwp->accessed = 1;
+		vcpu->arch.mwp = NULL;
+	}
+
+	gva = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+	if (gpa == UNMAPPED_GVA)
+		goto out;       /* let some write op map the page first */
+
+	/* does the mwait page we're looking for already exist? */
+	list_for_each_entry(p, &vcpu->kvm->arch.mwait_pg_list, link)
+		if (p->gpa == gpa) {
+			vcpu->arch.mwp = p;
+			break;
+		}
+	if (vcpu->arch.mwp == NULL) { /* no, add new mwait page */
+		if (!kvm_mmu_protect_page(vcpu->kvm, gpa_to_gfn(gpa)))
+			goto out;
+		/* zeroed allocation, so ->accessed starts out clear */
+		p = kzalloc(sizeof(*p), GFP_KERNEL);
+		if (p == NULL)
+			goto out;	/* stay unarmed; MWAIT becomes a NOP */
+		p->gpa = gpa;
+		INIT_LIST_HEAD(&p->vcpu_list);
+		list_add(&p->link, &vcpu->kvm->arch.mwait_pg_list);
+
+		vcpu->arch.mwp = p;
+	}
+
+	/* link this VCPU into list of VCPUs monitoring this mwait page */
+	list_add(&vcpu->arch.mw_link, &vcpu->arch.mwp->vcpu_list);
+
+out:
+	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+
+/*
+ * kvm_emulate_mwait - emulate the MWAIT instruction for @vcpu.
+ *
+ * If the VCPU is armed on a monitored page, park it in
+ * KVM_MP_STATE_MWAIT; otherwise (or without an in-kernel irqchip) the
+ * instruction acts as a NOP.
+ *
+ * Always returns 1.
+ */
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
+{
+	/* emulate as NOP if no-kvm-irqchip */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return 1;
+
+	mutex_lock(&vcpu->kvm->arch.mwait_lock);
+	if (vcpu->arch.mwp != NULL)
+		vcpu->arch.mp_state = KVM_MP_STATE_MWAIT;
+	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -6077,6 +6141,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
 				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
+				case KVM_MP_STATE_MWAIT:
 				case KVM_MP_STATE_HALTED:
 					vcpu->arch.pv.pv_unhalted = false;
 					vcpu->arch.mp_state =
@@ -6961,6 +7026,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
+	vcpu->arch.mwp = NULL;
+
 	return 0;
 fail_free_wbinvd_dirty_mask:
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -7013,6 +7080,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	pvclock_update_vm_gtod_copy(kvm);
 
+	mutex_init(&kvm->arch.mwait_lock);
+	INIT_LIST_HEAD(&kvm->arch.mwait_pg_list);
+
 	return 0;
 }
 
@@ -7254,8 +7324,10 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		|| kvm_apic_has_events(vcpu)
 		|| vcpu->arch.pv.pv_unhalted
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
-		(kvm_arch_interrupt_allowed(vcpu) &&
-		 kvm_cpu_has_interrupt(vcpu));
+		(kvm_cpu_has_interrupt(vcpu) &&
+		 (kvm_arch_interrupt_allowed(vcpu) ||
+		  (vcpu->arch.mp_state == KVM_MP_STATE_MWAIT &&
+		   kvm_register_read(vcpu, VCPU_REGS_RCX) & 0x01)));
 }
 
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 932d7f2..a4925fc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -398,6 +398,7 @@ struct kvm_vapic_addr {
 #define KVM_MP_STATE_INIT_RECEIVED     2
 #define KVM_MP_STATE_HALTED            3
 #define KVM_MP_STATE_SIPI_RECEIVED     4
+#define KVM_MP_STATE_MWAIT             5
 
 struct kvm_mp_state {
 	__u32 mp_state;
