Merge branch 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (269 commits) KVM: x86: Add missing locking to arch specific vcpu ioctls KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls KVM: MMU: Segregate shadow pages with different cr0.wp KVM: x86: Check LMA bit before set_efer KVM: Don't allow lmsw to clear cr0.pe KVM: Add cpuid.txt file KVM: x86: Tell the guest we'll warn it about tsc stability x86, paravirt: don't compute pvclock adjustments if we trust the tsc x86: KVM guest: Try using new kvm clock msrs KVM: x86: export paravirtual cpuid flags in KVM_GET_SUPPORTED_CPUID KVM: x86: add new KVMCLOCK cpuid feature KVM: x86: change msr numbers for kvmclock x86, paravirt: Add a global synchronization point for pvclock x86, paravirt: Enable pvclock flags in vcpu_time_info structure KVM: x86: Inject #GP with the right rip on efer writes KVM: SVM: Don't allow nested guest to VMMCALL into host KVM: x86: Fix exception reinjection forced to true KVM: Fix wallclock version writing race KVM: MMU: Don't read pdptrs with mmu spinlock held in mmu_alloc_roots KVM: VMX: enable VMXON check with SMX enabled (Intel TXT) ...

Merge branch 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.35' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (269 commits) KVM: x86: Add missing locking to arch specific vcpu ioctls KVM: PPC: Add missing vcpu_load()/vcpu_put() in vcpu ioctls KVM: MMU: Segregate shadow pages with different cr0.wp KVM: x86: Check LMA bit before set_efer KVM: Don't allow lmsw to clear cr0.pe KVM: Add cpuid.txt file KVM: x86: Tell the guest we'll warn it about tsc stability x86, paravirt: don't compute pvclock adjustments if we trust the tsc x86: KVM guest: Try using new kvm clock msrs KVM: x86: export paravirtual cpuid flags in KVM_GET_SUPPORTED_CPUID KVM: x86: add new KVMCLOCK cpuid feature KVM: x86: change msr numbers for kvmclock x86, paravirt: Add a global synchronization point for pvclock x86, paravirt: Enable pvclock flags in vcpu_time_info structure KVM: x86: Inject #GP with the right rip on efer writes KVM: SVM: Don't allow nested guest to VMMCALL into host KVM: x86: Fix exception reinjection forced to true KVM: Fix wallclock version writing race KVM: MMU: Don't read pdptrs with mmu spinlock held in mmu_alloc_roots KVM: VMX: enable VMXON check with SMX enabled (Intel TXT) ...
98edb6ca · Linus Torvalds · a8251096 · 8fbf065d · 98edb6ca · 98edb6ca
Commit 98edb6ca authored 15 years ago by Linus Torvalds
20 changed files
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -656,6 +656,7 @@ struct kvm_clock_data {
 4.29 KVM_GET_VCPU_EVENTS

 Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
 	__u32 flags;
 };

+KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+
 4.30 KVM_SET_VCPU_EVENTS

 Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,183 @@ current in-kernel state. The bits are:
 KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
 KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector

+If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
+the flags field to signal that interrupt.shadow contains a valid state and
+shall be written into the VCPU.
+
+4.32 KVM_GET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (out)
+Returns: 0 on success, -1 on error
+
+Reads debug registers from the vcpu.
+
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
+4.33 KVM_SET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (in)
+Returns: 0 on success, -1 on error
+
+Writes debug registers into the vcpu.
+
+See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
+yet and must be cleared on entry.
+
+4.34 KVM_SET_USER_MEMORY_REGION
+
+Capability: KVM_CAP_USER_MEM
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_userspace_memory_region (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_userspace_memory_region {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+
+This ioctl allows the user to create or modify a guest physical memory
+slot.  When changing an existing slot, it may be moved in the guest
+physical memory space, or its flags may be modified.  It may not be
+resized.  Slots may not overlap in guest physical address space.
+
+Memory for the region is taken starting at the address denoted by the
+field userspace_addr, which must point at user addressable memory for
+the entire memory slot size.  Any object may back this memory, including
+anonymous memory, ordinary files, and hugetlbfs.
+
+It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
+be identical.  This allows large pages in the guest to be backed by large
+pages in the host.
+
+The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
+instructs kvm to keep track of writes to memory within the slot.  See
+the KVM_GET_DIRTY_LOG ioctl.
+
+When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
+region are automatically reflected into the guest.  For example, an mmap()
+that affects the region will be made visible immediately.  Another example
+is madvise(MADV_DROP).
+
+It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl.
+The KVM_SET_MEMORY_REGION does not allow fine grained control over memory
+allocation and is deprecated.
+
+4.35 KVM_SET_TSS_ADDR
+
+Capability: KVM_CAP_SET_TSS_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long tss_address (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a three-page region in the guest
+physical address space.  The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address.  The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
+because of a quirk in the virtualization implementation (see the internals
+documentation when it pops into existence).
+
+4.36 KVM_ENABLE_CAP
+
+Capability: KVM_CAP_ENABLE_CAP
+Architectures: ppc
+Type: vcpu ioctl
+Parameters: struct kvm_enable_cap (in)
+Returns: 0 on success; -1 on error
+
+Not all extensions are enabled by default. Using this ioctl the application
+can enable an extension, making it available to the guest.
+
+On systems that do not support this ioctl, it always fails. On systems that
+do support it, it only works for extensions that are supported for enablement.
+
+To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should
+be used.
+
+struct kvm_enable_cap {
+       /* in */
+       __u32 cap;
+
+The capability that is supposed to get enabled.
+
+       __u32 flags;
+
+A bitfield indicating future enhancements. Has to be 0 for now.
+
+       __u64 args[4];
+
+Arguments for enabling a feature. If a feature needs initial values to
+function properly, this is the place to put them.
+
+       __u8  pad[64];
+};
+
+4.37 KVM_GET_MP_STATE
+
+Capability: KVM_CAP_MP_STATE
+Architectures: x86, ia64
+Type: vcpu ioctl
+Parameters: struct kvm_mp_state (out)
+Returns: 0 on success; -1 on error
+
+struct kvm_mp_state {
+	__u32 mp_state;
+};
+
+Returns the vcpu's current "multiprocessing state" (though also valid on
+uniprocessor guests).
+
+Possible values are:
+
+ - KVM_MP_STATE_RUNNABLE:        the vcpu is currently running
+ - KVM_MP_STATE_UNINITIALIZED:   the vcpu is an application processor (AP)
+                                 which has not yet received an INIT signal
+ - KVM_MP_STATE_INIT_RECEIVED:   the vcpu has received an INIT signal, and is
+                                 now ready for a SIPI
+ - KVM_MP_STATE_HALTED:          the vcpu has executed a HLT instruction and
+                                 is waiting for an interrupt
+ - KVM_MP_STATE_SIPI_RECEIVED:   the vcpu has just received a SIPI (vector
+                                 accesible via KVM_GET_VCPU_EVENTS)
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace.
+
+4.38 KVM_SET_MP_STATE
+
+Capability: KVM_CAP_MP_STATE
+Architectures: x86, ia64
+Type: vcpu ioctl
+Parameters: struct kvm_mp_state (in)
+Returns: 0 on success; -1 on error
+
+Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for
+arguments.
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace.

 5. The kvm_run structure

@@ -820,6 +1002,13 @@ executed a memory-mapped I/O instruction which could not be satisfied
 by kvm.  The 'data' member contains the written data if 'is_write' is
 true, and should be filled by application code otherwise.

+NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding
+operations are complete (and guest state is consistent) only after userspace
+has re-entered the kernel with KVM_RUN.  The kernel side will first finish
+incomplete operations and then check for pending signals.  Userspace
+can re-enter the guest with an unmasked signal pending to complete
+pending operations.
+
 		/* KVM_EXIT_HYPERCALL */
 		struct {
 			__u64 nr;
@@ -829,7 +1018,9 @@ true, and should be filled by application code otherwise.
 			__u32 pad;
 		} hypercall;

-Unused.
+Unused.  This was once used for 'hypercall to userspace'.  To implement
+such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390).
+Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO.

 		/* KVM_EXIT_TPR_ACCESS */
 		struct {
@@ -870,6 +1061,19 @@ s390 specific.

 powerpc specific.

+		/* KVM_EXIT_OSI */
+		struct {
+			__u64 gprs[32];
+		} osi;
+
+MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch
+hypercalls and exit with this exit struct that contains all the guest gprs.
+
+If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall.
+Userspace can now handle the hypercall and when it's done modify the gprs as
+necessary. Upon guest entry all guest GPRs will then be replaced by the values
+in this struct.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};

--- a/Documentation/kvm/cpuid.txt
+++ b/Documentation/kvm/cpuid.txt
+KVM CPUID bits
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+A guest running on a kvm host, can check some of its features using
+cpuid. This is not always guaranteed to work, since userspace can
+mask-out some, or even all KVM-related cpuid features before launching
+a guest.
+
+KVM cpuid functions are:
+
+function: KVM_CPUID_SIGNATURE (0x40000000)
+returns : eax = 0,
+          ebx = 0x4b4d564b,
+          ecx = 0x564b4d56,
+          edx = 0x4d.
+Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+This function queries the presence of KVM cpuid leafs.
+
+
+function: define KVM_CPUID_FEATURES (0x40000001)
+returns : ebx, ecx, edx = 0
+          eax = and OR'ed group of (1 << flag), where each flags is:
+
+
+flag                               || value || meaning
+=============================================================================
+KVM_FEATURE_CLOCKSOURCE            ||     0 || kvmclock available at msrs
+                                   ||       || 0x11 and 0x12.
+------------------------------------------------------------------------------
+KVM_FEATURE_NOP_IO_DELAY           ||     1 || not necessary to perform delays
+                                   ||       || on PIO operations.
+------------------------------------------------------------------------------
+KVM_FEATURE_MMU_OP                 ||     2 || deprecated.
+------------------------------------------------------------------------------
+KVM_FEATURE_CLOCKSOURCE2           ||     3 || kvmclock available at msrs
+                                   ||       || 0x4b564d00 and 0x4b564d01
+------------------------------------------------------------------------------
+KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
+                                   ||       || per-cpu warps are expected in
+                                   ||       || kvmclock.
+------------------------------------------------------------------------------
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
+The x86 kvm shadow mmu
+======================
+
+The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible
+for presenting a standard x86 mmu to the guest, while translating guest
+physical addresses to host physical addresses.
+
+The mmu code attempts to satisfy the following requirements:
+
+- correctness: the guest should not be able to determine that it is running
+               on an emulated mmu except for timing (we attempt to comply
+               with the specification, not emulate the characteristics of
+               a particular implementation such as tlb size)
+- security:    the guest must not be able to touch host memory not assigned
+               to it
+- performance: minimize the performance penalty imposed by the mmu
+- scaling:     need to scale to large memory and large vcpu guests
+- hardware:    support the full range of x86 virtualization hardware
+- integration: Linux memory management code must be in control of guest memory
+               so that swapping, page migration, page merging, transparent
+               hugepages, and similar features work without change
+- dirty tracking: report writes to guest memory to enable live migration
+               and framebuffer-based displays
+- footprint:   keep the amount of pinned kernel memory low (most memory
+               should be shrinkable)
+- reliablity:  avoid multipage or GFP_ATOMIC allocations
+
+Acronyms
+========
+
+pfn   host page frame number
+hpa   host physical address
+hva   host virtual address
+gfn   guest frame number
+gpa   guest physical address
+gva   guest virtual address
+ngpa  nested guest physical address
+ngva  nested guest virtual address
+pte   page table entry (used also to refer generically to paging structure
+      entries)
+gpte  guest pte (referring to gfns)
+spte  shadow pte (referring to pfns)
+tdp   two dimensional paging (vendor neutral term for NPT and EPT)
+
+Virtual and real hardware supported
+===================================
+
+The mmu supports first-generation mmu hardware, which allows an atomic switch
+of the current paging mode and cr3 during guest entry, as well as
+two-dimensional paging (AMD's NPT and Intel's EPT).  The emulated hardware
+it exposes is the traditional 2/3/4 level x86 mmu, with support for global
+pages, pae, pse, pse36, cr0.wp, and 1GB pages.  Work is in progress to support
+exposing NPT capable hardware on NPT capable hosts.
+
+Translation
+===========
+
+The primary job of the mmu is to program the processor's mmu to translate
+addresses for the guest.  Different translations are required at different
+times:
+
+- when guest paging is disabled, we translate guest physical addresses to
+  host physical addresses (gpa->hpa)
+- when guest paging is enabled, we translate guest virtual addresses, to
+  guest physical addresses, to host physical addresses (gva->gpa->hpa)
+- when the guest launches a guest of its own, we translate nested guest
+  virtual addresses, to nested guest physical addresses, to guest physical
+  addresses, to host physical addresses (ngva->ngpa->gpa->hpa)
+
+The primary challenge is to encode between 1 and 3 translations into hardware
+that support only 1 (traditional) and 2 (tdp) translations.  When the
+number of required translations matches the hardware, the mmu operates in
+direct mode; otherwise it operates in shadow mode (see below).
+
+Memory
+======
+
+Guest memory (gpa) is part of the user address space of the process that is
+using kvm.  Userspace defines the translation between guest addresses and user
+addresses (gpa->hva); note that two gpas may alias to the same gva, but not
+vice versa.
+
+These gvas may be backed using any method available to the host: anonymous
+memory, file backed memory, and device memory.  Memory might be paged by the
+host at any time.
+
+Events
+======
+
+The mmu is driven by events, some from the guest, some from the host.
+
+Guest generated events:
+- writes to control registers (especially cr3)
+- invlpg/invlpga instruction execution
+- access to missing or protected translations
+
+Host generated events:
+- changes in the gpa->hpa translation (either through gpa->hva changes or
+  through hva->hpa changes)
+- memory pressure (the shrinker)
+
+Shadow pages
+============
+
+The principal data structure is the shadow page, 'struct kvm_mmu_page'.  A
+shadow page contains 512 sptes, which can be either leaf or nonleaf sptes.  A
+shadow page may contain a mix of leaf and nonleaf sptes.
+
+A nonleaf spte allows the hardware mmu to reach the leaf pages and
+is not related to a translation directly.  It points to other shadow pages.
+
+A leaf spte corresponds to either one or two translations encoded into
+one paging structure entry.  These are always the lowest level of the
+translation stack, with optional higher level translations left to NPT/EPT.
+Leaf ptes point at guest pages.
+
+The following table shows translations encoded by leaf ptes, with higher-level
+translations in parentheses:
+
+ Non-nested guests:
+  nonpaging:     gpa->hpa
+  paging:        gva->gpa->hpa
+  paging, tdp:   (gva->)gpa->hpa
+ Nested guests:
+  non-tdp:       ngva->gpa->hpa  (*)
+  tdp:           (ngva->)ngpa->gpa->hpa
+
+(*) the guest hypervisor will encode the ngva->gpa translation into its page
+    tables if npt is not present
+
+Shadow pages contain the following information:
+  role.level:
+    The level in the shadow paging hierarchy that this shadow page belongs to.
+    1=4k sptes, 2=2M sptes, 3=1G sptes, etc.
+  role.direct:
+    If set, leaf sptes reachable from this page are for a linear range.
+    Examples include real mode translation, large guest pages backed by small
+    host pages, and gpa->hpa translations when NPT or EPT is active.
+    The linear range starts at (gfn << PAGE_SHIFT) and its size is determined
+    by role.level (2MB for first level, 1GB for second level, 0.5TB for third
+    level, 256TB for fourth level)
+    If clear, this page corresponds to a guest page table denoted by the gfn
+    field.
+  role.quadrant:
+    When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
+    sptes.  That means a guest page table contains more ptes than the host,
+    so multiple shadow pages are needed to shadow one guest page.
+    For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
+    first or second 512-gpte block in the guest page table.  For second-level
+    page tables, each 32-bit gpte is converted to two 64-bit sptes
+    (since each first-level guest page is shadowed by two first-level
+    shadow pages) so role.quadrant takes values in the range 0..3.  Each
+    quadrant maps 1GB virtual address space.
+  role.access:
+    Inherited guest access permissions in the form uwx.  Note execute
+    permission is positive, not negative.
+  role.invalid:
+    The page is invalid and should not be used.  It is a root page that is
+    currently pinned (by a cpu hardware register pointing to it); once it is
+    unpinned it will be destroyed.
+  role.cr4_pae:
+    Contains the value of cr4.pae for which the page is valid (e.g. whether
+    32-bit or 64-bit gptes are in use).
+  role.cr4_nxe:
+    Contains the value of efer.nxe for which the page is valid.
+  role.cr0_wp:
+    Contains the value of cr0.wp for which the page is valid.
+  gfn:
+    Either the guest page table containing the translations shadowed by this
+    page, or the base page frame for linear translations.  See role.direct.
+  spt:
+    A pageful of 64-bit sptes containing the translations for this page.
+    Accessed by both kvm and hardware.
+    The page pointed to by spt will have its page->private pointing back
+    at the shadow page structure.
+    sptes in spt point either at guest pages, or at lower-level shadow pages.
+    Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point
+    at __pa(sp2->spt).  sp2 will point back at sp1 through parent_pte.
+    The spt array forms a DAG structure with the shadow page as a node, and
+    guest pages as leaves.
+  gfns:
+    An array of 512 guest frame numbers, one for each present pte.  Used to
+    perform a reverse map from a pte to a gfn.
+  slot_bitmap:
+    A bitmap containing one bit per memory slot.  If the page contains a pte
+    mapping a page from memory slot n, then bit n of slot_bitmap will be set
+    (if a page is aliased among several slots, then it is not guaranteed that
+    all slots will be marked).
+    Used during dirty logging to avoid scanning a shadow page if none if its
+    pages need tracking.
+  root_count:
+    A counter keeping track of how many hardware registers (guest cr3 or
+    pdptrs) are now pointing at the page.  While this counter is nonzero, the
+    page cannot be destroyed.  See role.invalid.
+  multimapped:
+    Whether there exist multiple sptes pointing at this page.
+  parent_pte/parent_ptes:
+    If multimapped is zero, parent_pte points at the single spte that points at
+    this page's spt.  Otherwise, parent_ptes points at a data structure
+    with a list of parent_ptes.
+  unsync:
+    If true, then the translations in this page may not match the guest's
+    translation.  This is equivalent to the state of the tlb when a pte is
+    changed but before the tlb entry is flushed.  Accordingly, unsync ptes
+    are synchronized when the guest executes invlpg or flushes its tlb by
+    other means.  Valid for leaf pages.
+  unsync_children:
+    How many sptes in the page point at pages that are unsync (or have
+    unsynchronized children).
+  unsync_child_bitmap:
+    A bitmap indicating which sptes in spt point (directly or indirectly) at
+    pages that may be unsynchronized.  Used to quickly locate all unsychronized
+    pages reachable from a given page.
+
+Reverse map
+===========
+
+The mmu maintains a reverse mapping whereby all ptes mapping a page can be
+reached given its gfn.  This is used, for example, when swapping out a page.
+
+Synchronized and unsynchronized pages
+=====================================
+
+The guest uses two events to synchronize its tlb and page tables: tlb flushes
+and page invalidations (invlpg).
+
+A tlb flush means that we need to synchronize all sptes reachable from the
+guest's cr3.  This is expensive, so we keep all guest page tables write
+protected, and synchronize sptes to gptes when a gpte is written.
+
+A special case is when a guest page table is reachable from the current
+guest cr3.  In this case, the guest is obliged to issue an invlpg instruction
+before using the translation.  We take advantage of that by removing write
+protection from the guest page, and allowing the guest to modify it freely.
+We synchronize modified gptes when the guest invokes invlpg.  This reduces
+the amount of emulation we have to do when the guest modifies multiple gptes,
+or when the a guest page is no longer used as a page table and is used for
+random guest data.
+
+As a side effect we have to resynchronize all reachable unsynchronized shadow
+pages on a tlb flush.
+
+
+Reaction to events
+==================
+
+- guest page fault (or npt page fault, or ept violation)
+
+This is the most complicated event.  The cause of a page fault can be:
+
+  - a true guest fault (the guest translation won't allow the access) (*)
+  - access to a missing translation
+  - access to a protected translation
+    - when logging dirty pages, memory is write protected
+    - synchronized shadow pages are write protected (*)
+  - access to untranslatable memory (mmio)
+
+  (*) not applicable in direct mode
+
+Handling a page fault is performed as follows:
+
+ - if needed, walk the guest page tables to determine the guest translation
+   (gva->gpa or ngpa->gpa)
+   - if permissions are insufficient, reflect the fault back to the guest
+ - determine the host page
+   - if this is an mmio request, there is no host page; call the emulator
+     to emulate the instruction instead
+ - walk the shadow page table to find the spte for the translation,
+   instantiating missing intermediate page tables as necessary
+ - try to unsynchronize the page
+   - if successful, we can let the guest continue and modify the gpte
+ - emulate the instruction
+   - if failed, unshadow the page and let the guest continue
+ - update any translations that were modified by the instruction
+
+invlpg handling:
+
+  - walk the shadow page hierarchy and drop affected translations
+  - try to reinstantiate the indicated translation in the hope that the
+    guest will use it in the near future
+
+Guest control register updates:
+
+- mov to cr3
+  - look up new shadow roots
+  - synchronize newly reachable shadow pages
+
+- mov to cr0/cr4/efer
+  - set up mmu context for new paging mode
+  - look up new shadow roots
+  - synchronize newly reachable shadow pages
+
+Host translation updates:
+
+  - mmu notifier called with updated hva
+  - look up affected sptes through reverse map
+  - drop (or update) translations
+
+Further reading
+===============
+
+- NPT presentation from KVM Forum 2008
+  http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf
+
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -979,11 +979,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
 			goto out;
+		r = -ENXIO;
 		if (irqchip_in_kernel(kvm)) {
 			__s32 status;
 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 				    irq_event.irq, irq_event.level);
 			if (ioctl == KVM_IRQ_LINE_STATUS) {
+				r = -EFAULT;
 				irq_event.status = status;
 				if (copy_to_user(argp, &irq_event,
 							sizeof irq_event))
@@ -1379,7 +1381,7 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 	int i, j;
 	unsigned long base_gfn;

-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 	for (i = 0; i < slots->nmemslots; i++) {
 		memslot = &slots->memslots[i];
 		base_gfn = memslot->base_gfn;
@@ -1535,8 +1537,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			goto out;

 		if (copy_to_user(user_stack, stack,
-				 sizeof(struct kvm_ia64_vcpu_stack)))
+				 sizeof(struct kvm_ia64_vcpu_stack))) {
+			r = -EFAULT;
 			goto out;
+		}

 		break;
 	}

--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -51,7 +51,7 @@ static int __init  kvm_vmm_init(void)
 	vmm_fpswa_interface = fpswa_interface;

 	/*Register vmm data to kvm side*/
-	return kvm_init(&vmm_info, 1024, THIS_MODULE);
+	return kvm_init(&vmm_info, 1024, 0, THIS_MODULE);
 }

 static void __exit kvm_vmm_exit(void)

--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -21,6 +21,7 @@
 /* operations for longs and pointers */
 #define PPC_LL		stringify_in_c(ld)
 #define PPC_STL		stringify_in_c(std)
+#define PPC_STLU	stringify_in_c(stdu)
 #define PPC_LCMPI	stringify_in_c(cmpdi)
 #define PPC_LONG	stringify_in_c(.llong)
 #define PPC_LONG_ALIGN	stringify_in_c(.balign 8)
@@ -44,6 +45,7 @@
 /* operations for longs and pointers */
 #define PPC_LL		stringify_in_c(lwz)
 #define PPC_STL		stringify_in_c(stw)
+#define PPC_STLU	stringify_in_c(stwu)
 #define PPC_LCMPI	stringify_in_c(cmpwi)
 #define PPC_LONG	stringify_in_c(.long)
 #define PPC_LONG_ALIGN	stringify_in_c(.balign 4)

--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -77,4 +77,14 @@ struct kvm_debug_exit_arch {
 struct kvm_guest_debug_arch {
 };

+#define KVM_REG_MASK		0x001f
+#define KVM_REG_EXT_MASK	0xffe0
+#define KVM_REG_GPR		0x0000
+#define KVM_REG_FPR		0x0020
+#define KVM_REG_QPR		0x0040
+#define KVM_REG_FQPR		0x0060
+
+#define KVM_INTERRUPT_SET	-1U
+#define KVM_INTERRUPT_UNSET	-2U
+
 #endif /* __LINUX_KVM_POWERPC_H */
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -88,6 +88,8 @@

 #define BOOK3S_HFLAG_DCBZ32			0x1
 #define BOOK3S_HFLAG_SLB			0x2
+#define BOOK3S_HFLAG_PAIRED_SINGLE		0x4
+#define BOOK3S_HFLAG_NATIVE_PS			0x8

 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */

--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -22,46 +22,47 @@

 #include <linux/types.h>
 #include <linux/kvm_host.h>
-#include <asm/kvm_book3s_64_asm.h>
+#include <asm/kvm_book3s_asm.h>

 struct kvmppc_slb {
 	u64 esid;
 	u64 vsid;
 	u64 orige;
 	u64 origv;
-	bool valid;
-	bool Ks;
-	bool Kp;
-	bool nx;
-	bool large;	/* PTEs are 16MB */
-	bool tb;	/* 1TB segment */
-	bool class;
+	bool valid	: 1;
+	bool Ks		: 1;
+	bool Kp		: 1;
+	bool nx		: 1;
+	bool large	: 1;	/* PTEs are 16MB */
+	bool tb		: 1;	/* 1TB segment */
+	bool class	: 1;
 };

 struct kvmppc_sr {
 	u32 raw;
 	u32 vsid;
-	bool Ks;
-	bool Kp;
-	bool nx;
+	bool Ks		: 1;
+	bool Kp		: 1;
+	bool nx		: 1;
+	bool valid	: 1;
 };

 struct kvmppc_bat {
 	u64 raw;
 	u32 bepi;
 	u32 bepi_mask;
-	bool vs;
-	bool vp;
 	u32 brpn;
 	u8 wimg;
 	u8 pp;
+	bool vs		: 1;
+	bool vp		: 1;
 };

 struct kvmppc_sid_map {
 	u64 guest_vsid;
 	u64 guest_esid;
 	u64 host_vsid;
-	bool valid;
+	bool valid	: 1;
 };

 #define SID_MAP_BITS    9
@@ -70,7 +71,7 @@ struct kvmppc_sid_map {

 struct kvmppc_vcpu_book3s {
 	struct kvm_vcpu vcpu;
-	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+	struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
 	struct kvmppc_sid_map sid_map[SID_MAP_NUM];
 	struct kvmppc_slb slb[64];
 	struct {
@@ -82,9 +83,10 @@ struct kvmppc_vcpu_book3s {
 	struct kvmppc_bat ibat[8];
 	struct kvmppc_bat dbat[8];
 	u64 hid[6];
+	u64 gqr[8];
 	int slb_nr;
+	u32 dsisr;
 	u64 sdr1;
-	u64 dsisr;
 	u64 hior;
 	u64 msr_mask;
 	u64 vsid_first;
@@ -98,15 +100,15 @@ struct kvmppc_vcpu_book3s {
 #define CONTEXT_GUEST		1
 #define CONTEXT_GUEST_END	2

-#define VSID_REAL	0xfffffffffff00000
-#define VSID_REAL_DR	0xffffffffffe00000
-#define VSID_REAL_IR	0xffffffffffd00000
-#define VSID_BAT	0xffffffffffc00000
-#define VSID_PR		0x8000000000000000
+#define VSID_REAL	0x1fffffffffc00000ULL
+#define VSID_BAT	0x1fffffffffb00000ULL
+#define VSID_REAL_DR	0x2000000000000000ULL
+#define VSID_REAL_IR	0x4000000000000000ULL
+#define VSID_PR		0x8000000000000000ULL

-extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 ea, u64 ea_mask);
+extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask);
 extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
-extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end);
+extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
 extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
@@ -114,11 +116,13 @@ extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
 extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
-extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, bool data);
-extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr);
+extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
+extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
+extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
+extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);

 extern u32 kvmppc_trampoline_lowmem;
 extern u32 kvmppc_trampoline_enter;
@@ -126,6 +130,8 @@ extern void kvmppc_rmcall(ulong srr0, ulong srr1);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
+extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
+extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);

 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
@@ -140,7 +146,108 @@ static inline ulong dsisr(void)
 }

 extern void kvm_return_point(void);
+static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+	if ( num < 14 ) {
+		to_svcpu(vcpu)->gpr[num] = val;
+		to_book3s(vcpu)->shadow_vcpu->gpr[num] = val;
+	} else
+		vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+	if ( num < 14 )
+		return to_svcpu(vcpu)->gpr[num];
+	else
+		return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+	to_svcpu(vcpu)->cr = val;
+	to_book3s(vcpu)->shadow_vcpu->cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+	return to_svcpu(vcpu)->cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+	to_svcpu(vcpu)->xer = val;
+	to_book3s(vcpu)->shadow_vcpu->xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+	return to_svcpu(vcpu)->xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+	to_svcpu(vcpu)->ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+	return to_svcpu(vcpu)->ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+	to_svcpu(vcpu)->lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+	return to_svcpu(vcpu)->lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+	to_svcpu(vcpu)->pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+	return to_svcpu(vcpu)->pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+	struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu);
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
+
+	return svcpu->last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+	return to_svcpu(vcpu)->fault_dar;
+}
+
+/* Magic register values loaded into r3 and r4 before the 'sc' assembly
+ * instruction for the OSI hypercalls */
+#define OSI_SC_MAGIC_R3			0x113724FA
+#define OSI_SC_MAGIC_R4			0x77810F9B

 #define INS_DCBZ			0x7c0007ec

+/* Also add subarch specific defines */
+
+#ifdef CONFIG_PPC_BOOK3S_32
+#include <asm/kvm_book3s_32.h>
+#else
+#include <asm/kvm_book3s_64.h>
+#endif
+
 #endif /* __ASM_KVM_BOOK3S_H__ */
--- a/arch/powerpc/include/asm/kvm_book3s_32.h
+++ b/arch/powerpc/include/asm/kvm_book3s_32.h
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_32_H__
+#define __ASM_KVM_BOOK3S_32_H__
+
+static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
+{
+	return to_book3s(vcpu)->shadow_vcpu;
+}
+
+#define PTE_SIZE	12
+#define VSID_ALL	0
+#define SR_INVALID	0x00000001	/* VSID 1 should always be unused */
+#define SR_KP		0x20000000
+#define PTE_V		0x80000000
+#define PTE_SEC		0x00000040
+#define PTE_M		0x00000010
+#define PTE_R		0x00000100
+#define PTE_C		0x00000080
+
+#define SID_SHIFT	28
+#define ESID_MASK	0xf0000000
+#define VSID_MASK	0x00fffffff0000000ULL
+
+#endif /* __ASM_KVM_BOOK3S_32_H__ */
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_64_H__
+#define __ASM_KVM_BOOK3S_64_H__
+
+static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
+{
+	return &get_paca()->shadow_vcpu;
+}
+
+#endif /* __ASM_KVM_BOOK3S_64_H__ */
--- a/arch/powerpc/include/asm/kvm_book3s_64_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
@@ -22,7 +22,7 @@

 #ifdef __ASSEMBLY__

-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_HANDLER

 #include <asm/kvm_asm.h>

@@ -55,7 +55,7 @@ kvmppc_resume_\intno:
 .macro DO_KVM intno
 .endm

-#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+#endif /* CONFIG_KVM_BOOK3S_HANDLER */

 #else  /*__ASSEMBLY__ */

@@ -63,12 +63,33 @@ struct kvmppc_book3s_shadow_vcpu {
 	ulong gpr[14];
 	u32 cr;
 	u32 xer;
+
+	u32 fault_dsisr;
+	u32 last_inst;
+	ulong ctr;
+	ulong lr;
+	ulong pc;
+	ulong shadow_srr1;
+	ulong fault_dar;
+
 	ulong host_r1;
 	ulong host_r2;
 	ulong handler;
 	ulong scratch0;
 	ulong scratch1;
 	ulong vmhandler;
+	u8 in_guest;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	u32     sr[16];			/* Guest SRs */
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	u8 slb_max;			/* highest used guest slb entry */
+	struct  {
+		u64     esid;
+		u64     vsid;
+	} slb[64];			/* guest SLB */
+#endif
 };

 #endif /*__ASSEMBLY__ */

--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOKE_H__
+#define __ASM_KVM_BOOKE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+	vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+	return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.xer;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.last_inst;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pc;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.fault_dear;
+}
+
+#endif /* __ASM_KVM_BOOKE_H__ */
--- a/arch/powerpc/include/asm/kvm_fpu.h
+++ b/arch/powerpc/include/asm/kvm_fpu.h
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright Novell Inc. 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_FPU_H__
+#define __ASM_KVM_FPU_H__
+
+#include <linux/types.h>
+
+extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+
+extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+
+extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+		       u32 *src3);
+extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+		       u32 *src3);
+extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+		        u32 *src3);
+extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+		        u32 *src3);
+extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+		     u32 *src3);
+
+#define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
+				u64 *dst, u64 *src1);
+#define FPD_TWO_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
+				u64 *dst, u64 *src1, u64 *src2);
+#define FPD_THREE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
+				u64 *dst, u64 *src1, u64 *src2, u64 *src3);
+
+extern void fpd_fcmpu(u64 *fpscr, u32 *cr, u64 *src1, u64 *src2);
+extern void fpd_fcmpo(u64 *fpscr, u32 *cr, u64 *src1, u64 *src2);
+
+FPD_ONE_IN(fsqrts)
+FPD_ONE_IN(frsqrtes)
+FPD_ONE_IN(fres)
+FPD_ONE_IN(frsp)
+FPD_ONE_IN(fctiw)
+FPD_ONE_IN(fctiwz)
+FPD_ONE_IN(fsqrt)
+FPD_ONE_IN(fre)
+FPD_ONE_IN(frsqrte)
+FPD_ONE_IN(fneg)
+FPD_ONE_IN(fabs)
+FPD_TWO_IN(fadds)
+FPD_TWO_IN(fsubs)
+FPD_TWO_IN(fdivs)
+FPD_TWO_IN(fmuls)
+FPD_TWO_IN(fcpsgn)
+FPD_TWO_IN(fdiv)
+FPD_TWO_IN(fadd)
+FPD_TWO_IN(fmul)
+FPD_TWO_IN(fsub)
+FPD_THREE_IN(fmsubs)
+FPD_THREE_IN(fmadds)
+FPD_THREE_IN(fnmsubs)
+FPD_THREE_IN(fnmadds)
+FPD_THREE_IN(fsel)
+FPD_THREE_IN(fmsub)
+FPD_THREE_IN(fmadd)
+FPD_THREE_IN(fnmsub)
+FPD_THREE_IN(fnmadd)
+
+#endif
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -66,7 +66,7 @@ struct kvm_vcpu_stat {
 	u32 dec_exits;
 	u32 ext_intr_exits;
 	u32 halt_wakeup;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
 	u32 pf_storage;
 	u32 pf_instruc;
 	u32 sp_storage;
@@ -124,12 +124,12 @@ struct kvm_arch {
 };

 struct kvmppc_pte {
-	u64 eaddr;
+	ulong eaddr;
 	u64 vpage;
-	u64 raddr;
-	bool may_read;
-	bool may_write;
-	bool may_execute;
+	ulong raddr;
+	bool may_read		: 1;
+	bool may_write		: 1;
+	bool may_execute	: 1;
 };

 struct kvmppc_mmu {
@@ -145,7 +145,7 @@ struct kvmppc_mmu {
 	int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data);
 	void (*reset_msr)(struct kvm_vcpu *vcpu);
 	void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
-	int  (*esid_to_vsid)(struct kvm_vcpu *vcpu, u64 esid, u64 *vsid);
+	int  (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid);
 	u64  (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data);
 	bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
 };
@@ -160,7 +160,7 @@ struct hpte_cache {
 struct kvm_vcpu_arch {
 	ulong host_stack;
 	u32 host_pid;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
 	ulong host_msr;
 	ulong host_r2;
 	void *host_retip;
@@ -175,7 +175,7 @@ struct kvm_vcpu_arch {
 	ulong gpr[32];

 	u64 fpr[32];
-	u32 fpscr;
+	u64 fpscr;

 #ifdef CONFIG_ALTIVEC
 	vector128 vr[32];
@@ -186,19 +186,23 @@ struct kvm_vcpu_arch {
 	u64 vsr[32];
 #endif

+#ifdef CONFIG_PPC_BOOK3S
+	/* For Gekko paired singles */
+	u32 qpr[32];
+#endif
+
+#ifdef CONFIG_BOOKE
 	ulong pc;
 	ulong ctr;
 	ulong lr;

-#ifdef CONFIG_BOOKE
 	ulong xer;
 	u32 cr;
 #endif

 	ulong msr;
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
 	ulong shadow_msr;
-	ulong shadow_srr1;
 	ulong hflags;
 	ulong guest_owned_ext;
 #endif
@@ -253,20 +257,22 @@ struct kvm_vcpu_arch {
 	struct dentry *debugfs_exit_timing;
 #endif

+#ifdef CONFIG_BOOKE
 	u32 last_inst;
-#ifdef CONFIG_PPC64
-	ulong fault_dsisr;
-#endif
 	ulong fault_dear;
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+#endif
 	gpa_t paddr_accessed;

 	u8 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_is_bigendian;
+	u8 mmio_sign_extend;
 	u8 dcr_needed;
 	u8 dcr_is_write;
+	u8 osi_needed;
+	u8 osi_enabled;

 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */

@@ -275,7 +281,7 @@ struct kvm_vcpu_arch {
 	u64 dec_jiffies;
 	unsigned long pending_exceptions;

-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S
 	struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
 	int hpte_cache_offset;
 #endif

--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -30,6 +30,8 @@
 #include <linux/kvm_host.h>
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/kvm_book3s.h>
+#else
+#include <asm/kvm_booke.h>
 #endif

 enum emulation_result {
@@ -37,6 +39,7 @@ enum emulation_result {
 	EMULATE_DO_MMIO,      /* kvm_run filled with MMIO request */
 	EMULATE_DO_DCR,       /* kvm_run filled with DCR request */
 	EMULATE_FAIL,         /* can't emulate this instruction */
+	EMULATE_AGAIN,        /* something went wrong. go again */
 };

 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -48,8 +51,11 @@ extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
 extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
                              unsigned int rt, unsigned int bytes,
                              int is_bigendian);
+extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                               unsigned int rt, unsigned int bytes,
+                               int is_bigendian);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                               u32 val, unsigned int bytes, int is_bigendian);
+                               u64 val, unsigned int bytes, int is_bigendian);

 extern int kvmppc_emulate_instruction(struct kvm_run *run,
                                      struct kvm_vcpu *vcpu);
@@ -63,6 +69,7 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
 extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
 extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
@@ -88,6 +95,8 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                       struct kvm_interrupt *irq);
+extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
+                                         struct kvm_interrupt *irq);

 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                  unsigned int op, int *advance);
@@ -99,81 +108,37 @@ extern void kvmppc_booke_exit(void);

 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);

-#ifdef CONFIG_PPC_BOOK3S
-
-/* We assume we're always acting on the current vcpu */
-
-static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
-{
-	if ( num < 14 ) {
-		get_paca()->shadow_vcpu.gpr[num] = val;
-		to_book3s(vcpu)->shadow_vcpu.gpr[num] = val;
-	} else
-		vcpu->arch.gpr[num] = val;
-}
-
-static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
-{
-	if ( num < 14 )
-		return get_paca()->shadow_vcpu.gpr[num];
-	else
-		return vcpu->arch.gpr[num];
-}
-
-static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
-{
-	get_paca()->shadow_vcpu.cr = val;
-	to_book3s(vcpu)->shadow_vcpu.cr = val;
-}
-
-static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
-{
-	return get_paca()->shadow_vcpu.cr;
-}
-
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
-{
-	get_paca()->shadow_vcpu.xer = val;
-	to_book3s(vcpu)->shadow_vcpu.xer = val;
-}
-
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+/*
+ * Cuts out inst bits with ordering according to spec.
+ * That means the leftmost bit is zero. All given bits are included.
+ */
+static inline u32 kvmppc_get_field(u64 inst, int msb, int lsb)
 {
-	return get_paca()->shadow_vcpu.xer;
-}
+	u32 r;
+	u32 mask;

-#else
+	BUG_ON(msb > lsb);

-static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
-{
-	vcpu->arch.gpr[num] = val;
-}
+	mask = (1 << (lsb - msb + 1)) - 1;
+	r = (inst >> (63 - lsb)) & mask;

-static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
-{
-	return vcpu->arch.gpr[num];
+	return r;
 }

-static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+/*
+ * Replaces inst bits with ordering according to spec.
+ */
+static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
 {
-	vcpu->arch.cr = val;
-}
+	u32 r;
+	u32 mask;

-static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.cr;
-}
+	BUG_ON(msb > lsb);

-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
-{
-	vcpu->arch.xer = val;
-}
+	mask = ((1 << (lsb - msb + 1)) - 1) << (63 - lsb);
+	r = (inst & ~mask) | ((value << (63 - lsb)) & mask);

-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.xer;
+	return r;
 }

-#endif
-
 #endif /* __POWERPC_KVM_PPC_H__ */
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -27,6 +27,8 @@ extern int __init_new_context(void);
 extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
+extern unsigned long __init_new_context(void);
+extern void __destroy_context(unsigned long context_id);
 extern void mmu_context_init(void);
 #endif


--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -23,7 +23,7 @@
 #include <asm/page.h>
 #include <asm/exception-64e.h>
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#include <asm/kvm_book3s_64_asm.h>
+#include <asm/kvm_book3s_asm.h>
 #endif

 register struct paca_struct *local_paca asm("r13");
@@ -137,15 +137,9 @@ struct paca_struct {
 	u64 startpurr;			/* PURR/TB value snapshot */
 	u64 startspurr;			/* SPURR value snapshot */

-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	struct  {
-		u64     esid;
-		u64     vsid;
-	} kvm_slb[64];			/* guest SLB */
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
-	u8 kvm_slb_max;			/* highest used guest slb entry */
-	u8 kvm_in_guest;		/* are we inside the guest? */
 #endif
 };


--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -229,6 +229,9 @@ struct thread_struct {
 	unsigned long	spefscr;	/* SPE & eFP status */
 	int		used_spe;	/* set if process has used spe */
 #endif /* CONFIG_SPE */
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	void*		kvm_shadow_vcpu; /* KVM internal data */
+#endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
 };

 #define ARCH_MIN_TASKALIGN 16

--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -293,10 +293,12 @@
 #define HID1_ABE	(1<<10)		/* 7450 Address Broadcast Enable */
 #define HID1_PS		(1<<16)		/* 750FX PLL selection */
 #define SPRN_HID2	0x3F8		/* Hardware Implementation Register 2 */
+#define SPRN_HID2_GEKKO	0x398		/* Gekko HID2 Register */
 #define SPRN_IABR	0x3F2	/* Instruction Address Breakpoint Register */
 #define SPRN_IABR2	0x3FA		/* 83xx */
 #define SPRN_IBCR	0x135		/* 83xx Insn Breakpoint Control Reg */
 #define SPRN_HID4	0x3F4		/* 970 HID4 */
+#define SPRN_HID4_GEKKO	0x3F3		/* Gekko HID4 */
 #define SPRN_HID5	0x3F6		/* 970 HID5 */
 #define SPRN_HID6	0x3F9	/* BE HID 6 */
 #define   HID6_LB	(0x0F<<12) /* Concurrent Large Page Modes */
@@ -465,6 +467,14 @@
 #define SPRN_VRSAVE	0x100	/* Vector Register Save Register */
 #define SPRN_XER	0x001	/* Fixed Point Exception Register */

+#define SPRN_MMCR0_GEKKO 0x3B8 /* Gekko Monitor Mode Control Register 0 */
+#define SPRN_MMCR1_GEKKO 0x3BC /* Gekko Monitor Mode Control Register 1 */
+#define SPRN_PMC1_GEKKO  0x3B9 /* Gekko Performance Monitor Control 1 */
+#define SPRN_PMC2_GEKKO  0x3BA /* Gekko Performance Monitor Control 2 */
+#define SPRN_PMC3_GEKKO  0x3BD /* Gekko Performance Monitor Control 3 */
+#define SPRN_PMC4_GEKKO  0x3BE /* Gekko Performance Monitor Control 4 */
+#define SPRN_WPAR_GEKKO  0x399 /* Gekko Write Pipe Address Register */
+
 #define SPRN_SCOMC	0x114	/* SCOM Access Control */
 #define SPRN_SCOMD	0x115	/* SCOM Access DATA */