Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -070041#include <linux/sched.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070042#include <linux/highmem.h>
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070043#include <linux/debugfs.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070044#include <linux/bug.h>
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -070045#include <linux/vmalloc.h>
Randy Dunlap44408ad2009-05-12 13:31:40 -070046#include <linux/module.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090047#include <linux/gfp.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070048
49#include <asm/pgtable.h>
50#include <asm/tlbflush.h>
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -070051#include <asm/fixmap.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070052#include <asm/mmu_context.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080053#include <asm/setup.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070054#include <asm/paravirt.h>
Alex Nixon7347b402010-02-19 13:31:06 -050055#include <asm/e820.h>
Jeremy Fitzhardingecbcd79c2008-07-08 15:06:27 -070056#include <asm/linkage.h>
Alex Nixon08bbc9d2009-02-09 12:05:46 -080057#include <asm/page.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070058
59#include <asm/xen/hypercall.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070060#include <asm/xen/hypervisor.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070061
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -080062#include <xen/xen.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070063#include <xen/page.h>
64#include <xen/interface/xen.h>
Stefano Stabellini59151002010-06-17 14:22:52 +010065#include <xen/interface/hvm/hvm_op.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080066#include <xen/interface/version.h>
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -080067#include <xen/interface/memory.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080068#include <xen/hvc-console.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070069
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070070#include "multicalls.h"
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070071#include "mmu.h"
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070072#include "debugfs.h"
73
74#define MMU_UPDATE_HISTO 30
75
Alex Nixon19001c82009-02-09 12:05:46 -080076/*
77 * Protects atomic reservation decrease/increase against concurrent increases.
78 * Also protects non-atomic updates of current_pages and driver_pages, and
79 * balloon lists.
80 */
81DEFINE_SPINLOCK(xen_reservation_lock);
82
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070083#ifdef CONFIG_XEN_DEBUG_FS
84
85static struct {
86 u32 pgd_update;
87 u32 pgd_update_pinned;
88 u32 pgd_update_batched;
89
90 u32 pud_update;
91 u32 pud_update_pinned;
92 u32 pud_update_batched;
93
94 u32 pmd_update;
95 u32 pmd_update_pinned;
96 u32 pmd_update_batched;
97
98 u32 pte_update;
99 u32 pte_update_pinned;
100 u32 pte_update_batched;
101
102 u32 mmu_update;
103 u32 mmu_update_extended;
104 u32 mmu_update_histo[MMU_UPDATE_HISTO];
105
106 u32 prot_commit;
107 u32 prot_commit_batched;
108
109 u32 set_pte_at;
110 u32 set_pte_at_batched;
111 u32 set_pte_at_pinned;
112 u32 set_pte_at_current;
113 u32 set_pte_at_kernel;
114} mmu_stats;
115
116static u8 zero_stats;
117
118static inline void check_zero(void)
119{
120 if (unlikely(zero_stats)) {
121 memset(&mmu_stats, 0, sizeof(mmu_stats));
122 zero_stats = 0;
123 }
124}
125
126#define ADD_STATS(elem, val) \
127 do { check_zero(); mmu_stats.elem += (val); } while(0)
128
129#else /* !CONFIG_XEN_DEBUG_FS */
130
131#define ADD_STATS(elem, val) do { (void)(val); } while(0)
132
133#endif /* CONFIG_XEN_DEBUG_FS */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700134
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -0800135
136/*
137 * Identity map, in addition to plain kernel map. This needs to be
138 * large enough to allocate page table pages for mapping the rest.
139 * Each page can map 2MB.
140 */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -0700141#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
142static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
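
/*
 * Worked example (illustrative): with PTRS_PER_PTE == 512 (64-bit or
 * PAE), LEVEL1_IDENT_ENTRIES is 4 * 512 = 2048 ptes.  Each level-1
 * page maps 512 * 4kB = 2MB, so the four reserved pages can identity
 * map up to 8MB.
 */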
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -0800143
144#ifdef CONFIG_X86_64
145/* l3 pud for userspace vsyscall mapping */
146static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
147#endif /* CONFIG_X86_64 */
148
149/*
150 * Note about cr3 (pagetable base) values:
151 *
152 * xen_cr3 contains the current logical cr3 value; it contains the
153 * last set cr3. This may not be the current effective cr3, because
154 * its update may be being lazily deferred. However, a vcpu looking
155 * at its own cr3 can use this value knowing that everything will
156 * be self-consistent.
157 *
158 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
159 * hypercall to set the vcpu cr3 is complete (so it may be a little
160 * out of date, but it will never be set early). If one vcpu is
161 * looking at another vcpu's cr3 value, it should use this variable.
162 */
163DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
164DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
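
/*
 * Illustrative usage, matching the rules above:
 *
 *	percpu_read(xen_cr3)		- this vcpu's own logical cr3
 *	per_cpu(xen_current_cr3, cpu)	- safe when inspecting another
 *					  vcpu (see xen_drop_mm_ref())
 */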
165
166
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700167/*
168 * Just beyond the highest usermode address. STACK_TOP_MAX has a
169 * redzone above it, so round it up to a PGD boundary.
170 */
171#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
172
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700173static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES;
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700174
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700175#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
176#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE)
177#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES)
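
/*
 * Sizing sketch (illustrative): with 4kB pages and 8-byte entries on
 * 64-bit, P2M_ENTRIES_PER_PAGE is 512, so one leaf p2m page covers
 * 512 pfns (2MB of pseudo-physical space) and TOP_ENTRIES(pages) is
 * simply pages/512.  On 32-bit the entries are 4 bytes, giving 1024
 * entries per page.
 */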
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100178
Jeremy Fitzhardingecf0923e2008-05-26 23:31:20 +0100179/* Placeholder for holes in the address space */
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700180static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE);
Jeremy Fitzhardingecf0923e2008-05-26 23:31:20 +0100181
182 /* Array of pointers to pages containing p2m entries */
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700183static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100184
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100185/* Arrays of p2m arrays expressed in mfns used for save/restore */
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700186static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100187
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700188static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list,
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700189 (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE));
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100190
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100191static inline unsigned p2m_top_index(unsigned long pfn)
192{
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700193 BUG_ON(pfn >= max_p2m_pfn);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100194 return pfn / P2M_ENTRIES_PER_PAGE;
195}
196
197static inline unsigned p2m_index(unsigned long pfn)
198{
199 return pfn % P2M_ENTRIES_PER_PAGE;
200}
201
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100202/* Build the parallel p2m_top_mfn structures */
Ian Campbellfa24ba62009-11-21 11:32:49 +0000203void xen_build_mfn_list_list(void)
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100204{
205 unsigned pfn, idx;
206
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700207 for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100208 unsigned topidx = p2m_top_index(pfn);
209
210 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
211 }
212
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700213 for (idx = 0;
214 idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE;
215 idx++) {
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100216 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
217 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
218 }
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800219}
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100220
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800221void xen_setup_mfn_list_list(void)
222{
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100223 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
224
225 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
226 virt_to_mfn(p2m_top_mfn_list);
227 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
228}
229
230/* Set up p2m_top to point to the domain-builder provided p2m pages */
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100231void __init xen_build_dynamic_phys_to_machine(void)
232{
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100233 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100234 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100235 unsigned pfn;
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700236 unsigned i;
237
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700238 max_p2m_pfn = max_pfn;
239
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700240 p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE,
241 PAGE_SIZE);
242 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
243 p2m_missing[i] = ~0UL;
244
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700245 p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn),
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700246 PAGE_SIZE);
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700247 for (i = 0; i < TOP_ENTRIES(max_pfn); i++)
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700248 p2m_top[i] = p2m_missing;
249
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700250 p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn),
251 PAGE_SIZE);
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700252 p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) *
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700253 (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE),
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700254 PAGE_SIZE);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100255
Tejf63c2f22008-12-16 11:56:06 -0800256 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100257 unsigned topidx = p2m_top_index(pfn);
258
259 p2m_top[topidx] = &mfn_list[pfn];
260 }
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800261
262 xen_build_mfn_list_list();
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100263}
264
265unsigned long get_phys_to_machine(unsigned long pfn)
266{
267 unsigned topidx, idx;
268
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700269 if (unlikely(pfn >= max_p2m_pfn))
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100270 return INVALID_P2M_ENTRY;
271
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100272 topidx = p2m_top_index(pfn);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100273 idx = p2m_index(pfn);
274 return p2m_top[topidx][idx];
275}
Ingo Molnar15ce60052008-06-02 13:20:11 +0200276EXPORT_SYMBOL_GPL(get_phys_to_machine);
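
/*
 * Example caller pattern (illustrative; the error value is made up):
 *
 *	unsigned long mfn = get_phys_to_machine(pfn);
 *	if (mfn == INVALID_P2M_ENTRY)
 *		return -ENXIO;	 no machine frame backs this pfn
 *
 * pfn_to_mfn() (and, through it, virt_to_machine()) ultimately rely
 * on this lookup.
 */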
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100277
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800278/* install a new p2m_top page */
279bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100280{
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800281 unsigned topidx = p2m_top_index(pfn);
282 unsigned long **pfnp, *mfnp;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100283 unsigned i;
284
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800285 pfnp = &p2m_top[topidx];
286 mfnp = &p2m_top_mfn[topidx];
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100287
Tejf63c2f22008-12-16 11:56:06 -0800288 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100289 p[i] = INVALID_P2M_ENTRY;
290
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800291 if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100292 *mfnp = virt_to_mfn(p);
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800293 return true;
294 }
295
296 return false;
297}
298
299static void alloc_p2m(unsigned long pfn)
300{
301 unsigned long *p;
302
303 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
304 BUG_ON(p == NULL);
305
306 if (!install_p2mtop_page(pfn, p))
307 free_page((unsigned long)p);
308}
309
310/* Try to install p2m mapping; fail if intermediate bits missing */
311bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
312{
313 unsigned topidx, idx;
314
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700315 if (unlikely(pfn >= max_p2m_pfn)) {
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800316 BUG_ON(mfn != INVALID_P2M_ENTRY);
317 return true;
318 }
319
320 topidx = p2m_top_index(pfn);
321 if (p2m_top[topidx] == p2m_missing) {
322 if (mfn == INVALID_P2M_ENTRY)
323 return true;
324 return false;
325 }
326
327 idx = p2m_index(pfn);
328 p2m_top[topidx][idx] = mfn;
329
330 return true;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100331}
332
333void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
334{
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100335 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
336 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
337 return;
338 }
339
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800340 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
341 alloc_p2m(pfn);
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100342
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800343 if (!__set_phys_to_machine(pfn, mfn))
344 BUG();
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100345 }
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100346}
347
Jeremy Fitzhardinge9976b392009-02-27 09:19:26 -0800348unsigned long arbitrary_virt_to_mfn(void *vaddr)
349{
350 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
351
352 return PFN_DOWN(maddr.maddr);
353}
354
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700355xmaddr_t arbitrary_virt_to_machine(void *vaddr)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700356{
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700357 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100358 unsigned int level;
Chris Lalancette9f32d212008-10-23 17:40:25 -0700359 pte_t *pte;
360 unsigned offset;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700361
Chris Lalancette9f32d212008-10-23 17:40:25 -0700362 /*
363 * If the PFN is in the linearly mapped vaddr range, we can just use
364 * the (quick) virt_to_machine() p2m lookup
365 */
366 if (virt_addr_valid(vaddr))
367 return virt_to_machine(vaddr);
368
369 /* otherwise we have to do a (slower) full page-table walk */
370
371 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700372 BUG_ON(pte == NULL);
Chris Lalancette9f32d212008-10-23 17:40:25 -0700373 offset = address & ~PAGE_MASK;
Jeremy Fitzhardingeebd879e2008-07-08 15:06:54 -0700374 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700375}
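
/*
 * Illustrative contrast between the two paths above (the vmalloc
 * pointer is just an example):
 *
 *	virt_to_machine(&lowmem_object)		- quick p2m lookup
 *	arbitrary_virt_to_machine(vmalloc_ptr)	- full pagetable walk
 *
 * Both return an xmaddr_t machine address usable in hypercall
 * arguments.
 */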
376
377void make_lowmem_page_readonly(void *vaddr)
378{
379 pte_t *pte, ptev;
380 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100381 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700382
Ingo Molnarf0646e42008-01-30 13:33:43 +0100383 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700384 BUG_ON(pte == NULL);
385
386 ptev = pte_wrprotect(*pte);
387
388 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
389 BUG();
390}
391
392void make_lowmem_page_readwrite(void *vaddr)
393{
394 pte_t *pte, ptev;
395 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100396 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700397
Ingo Molnarf0646e42008-01-30 13:33:43 +0100398 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700399 BUG_ON(pte == NULL);
400
401 ptev = pte_mkwrite(*pte);
402
403 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
404 BUG();
405}
406
407
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700408static bool xen_page_pinned(void *ptr)
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100409{
410 struct page *page = virt_to_page(ptr);
411
412 return PagePinned(page);
413}
414
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800415static bool xen_iomap_pte(pte_t pte)
416{
Alex Nixon7347b402010-02-19 13:31:06 -0500417 return pte_flags(pte) & _PAGE_IOMAP;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800418}
419
420static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
421{
422 struct multicall_space mcs;
423 struct mmu_update *u;
424
425 mcs = xen_mc_entry(sizeof(*u));
426 u = mcs.args;
427
428 /* ptep might be kmapped when using 32-bit HIGHPTE */
429 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
430 u->val = pte_val_ma(pteval);
431
432 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
433
434 xen_mc_issue(PARAVIRT_LAZY_MMU);
435}
436
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700437static void xen_extend_mmu_update(const struct mmu_update *update)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700438{
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700439 struct multicall_space mcs;
440 struct mmu_update *u;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700441
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700442 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
443
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700444 if (mcs.mc != NULL) {
445 ADD_STATS(mmu_update_extended, 1);
446 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
447
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700448 mcs.mc->args[1]++;
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700449
450 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
451 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
452 else
453 ADD_STATS(mmu_update_histo[0], 1);
454 } else {
455 ADD_STATS(mmu_update, 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700456 mcs = __xen_mc_entry(sizeof(*u));
457 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700458 ADD_STATS(mmu_update_histo[1], 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700459 }
460
461 u = mcs.args;
462 *u = *update;
463}
464
465void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
466{
467 struct mmu_update u;
468
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700469 preempt_disable();
470
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700471 xen_mc_batch();
472
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700473 /* ptr may be ioremapped for 64-bit pagetable setup */
474 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700475 u.val = pmd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700476 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700477
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700478 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
479
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700480 xen_mc_issue(PARAVIRT_LAZY_MMU);
481
482 preempt_enable();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700483}
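
/*
 * Sketch of the batching pattern shared by the *_hyper helpers in this
 * file (descriptive summary, not a new interface):
 *
 *	xen_mc_batch();			   start/extend a multicall batch
 *	xen_extend_mmu_update(&u);	   queue one or more updates
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);   flush now, unless we are in
 *					   lazy MMU mode, in which case
 *					   the batch is issued when lazy
 *					   mode is left
 *
 * Preemption is disabled across the sequence so the per-cpu multicall
 * buffer cannot change under us.
 */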
484
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100485void xen_set_pmd(pmd_t *ptr, pmd_t val)
486{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700487 ADD_STATS(pmd_update, 1);
488
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100489 /* If page is not pinned, we can just update the entry
490 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700491 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100492 *ptr = val;
493 return;
494 }
495
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700496 ADD_STATS(pmd_update_pinned, 1);
497
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100498 xen_set_pmd_hyper(ptr, val);
499}
500
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700501/*
502 * Associate a virtual page frame with a given physical page frame
503 * and protection flags for that frame.
504 */
505void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
506{
Jeremy Fitzhardinge836fe2f2008-07-08 15:06:58 -0700507 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700508}
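
/*
 * Example (illustrative): map a hypervisor-provided frame at a fixmap
 * address; FIX_EXAMPLE is a placeholder, not a real fixmap index:
 *
 *	set_pte_mfn(fix_to_virt(FIX_EXAMPLE), mfn, PAGE_KERNEL);
 */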
509
510void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
511 pte_t *ptep, pte_t pteval)
512{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800513 if (xen_iomap_pte(pteval)) {
514 xen_set_iomap_pte(ptep, pteval);
515 goto out;
516 }
517
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700518 ADD_STATS(set_pte_at, 1);
519// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
520 ADD_STATS(set_pte_at_current, mm == current->mm);
521 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
522
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700523 if (mm == current->mm || mm == &init_mm) {
Jeremy Fitzhardinge8965c1c2007-10-16 11:51:29 -0700524 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700525 struct multicall_space mcs;
526 mcs = xen_mc_entry(0);
527
528 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700529 ADD_STATS(set_pte_at_batched, 1);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700530 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700531 goto out;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700532 } else
533 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700534 goto out;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700535 }
536 xen_set_pte(ptep, pteval);
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700537
Jeremy Fitzhardinge2829b442009-02-17 23:53:19 -0800538out: return;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700539}
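
/*
 * The decision flow above, summarised:
 *  - _PAGE_IOMAP ptes go through the DOMID_IO mmu_update path;
 *  - for current->mm or init_mm in lazy MMU mode, the write is queued
 *    as an update_va_mapping multicall;
 *  - otherwise a direct HYPERVISOR_update_va_mapping is attempted;
 *  - if that fails, or the mm is foreign, fall back to xen_set_pte().
 */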
540
Tejf63c2f22008-12-16 11:56:06 -0800541pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
542 unsigned long addr, pte_t *ptep)
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700543{
544 /* Just return the pte as-is. We preserve the bits on commit */
545 return *ptep;
546}
547
548void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
549 pte_t *ptep, pte_t pte)
550{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700551 struct mmu_update u;
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700552
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700553 xen_mc_batch();
554
Chris Lalancette9f32d212008-10-23 17:40:25 -0700555 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700556 u.val = pte_val_ma(pte);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700557 xen_extend_mmu_update(&u);
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700558
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700559 ADD_STATS(prot_commit, 1);
560 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
561
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700562 xen_mc_issue(PARAVIRT_LAZY_MMU);
563}
564
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700565/* Assume pteval_t is equivalent to all the other *val_t types. */
566static pteval_t pte_mfn_to_pfn(pteval_t val)
567{
568 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700569 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700570 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700571 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700572 }
573
574 return val;
575}
576
577static pteval_t pte_pfn_to_mfn(pteval_t val)
578{
579 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700580 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700581 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700582 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700583 }
584
585 return val;
586}
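
/*
 * Round-trip sketch (illustrative): for a present, non-IOMAP pte value
 * the two helpers above are inverses, so
 *
 *	pte_mfn_to_pfn(pte_pfn_to_mfn(val)) == val
 *
 * whenever the p2m and m2p tables agree on that frame.  This is the
 * pfn<->mfn conversion described in the comment at the top of the
 * file.
 */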
587
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800588static pteval_t iomap_pte(pteval_t val)
589{
590 if (val & _PAGE_PRESENT) {
591 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
592 pteval_t flags = val & PTE_FLAGS_MASK;
593
594 /* We assume the pte frame number is an MFN, so
595 just use it as-is. */
596 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
597 }
598
599 return val;
600}
601
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700602pteval_t xen_pte_val(pte_t pte)
603{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800604 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
605 return pte.pte;
606
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700607 return pte_mfn_to_pfn(pte.pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700608}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800609PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700610
611pgdval_t xen_pgd_val(pgd_t pgd)
612{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700613 return pte_mfn_to_pfn(pgd.pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700614}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800615PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700616
617pte_t xen_make_pte(pteval_t pte)
618{
Alex Nixon7347b402010-02-19 13:31:06 -0500619 phys_addr_t addr = (pte & PTE_PFN_MASK);
620
621 /*
622 * Unprivileged domains are allowed to do IOMAPpings for
623 * PCI passthrough, but not map ISA space. The ISA
624 * mappings are just dummy local mappings to keep other
625 * parts of the kernel happy.
626 */
627 if (unlikely(pte & _PAGE_IOMAP) &&
628 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800629 pte = iomap_pte(pte);
Alex Nixon7347b402010-02-19 13:31:06 -0500630 } else {
631 pte &= ~_PAGE_IOMAP;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800632 pte = pte_pfn_to_mfn(pte);
Alex Nixon7347b402010-02-19 13:31:06 -0500633 }
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800634
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700635 return native_make_pte(pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700636}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800637PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700638
639pgd_t xen_make_pgd(pgdval_t pgd)
640{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700641 pgd = pte_pfn_to_mfn(pgd);
642 return native_make_pgd(pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700643}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800644PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700645
646pmdval_t xen_pmd_val(pmd_t pmd)
647{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700648 return pte_mfn_to_pfn(pmd.pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700649}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800650PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +0100651
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100652void xen_set_pud_hyper(pud_t *ptr, pud_t val)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700653{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700654 struct mmu_update u;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700655
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700656 preempt_disable();
657
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700658 xen_mc_batch();
659
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700660 /* ptr may be ioremapped for 64-bit pagetable setup */
661 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700662 u.val = pud_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700663 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700664
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700665 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
666
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700667 xen_mc_issue(PARAVIRT_LAZY_MMU);
668
669 preempt_enable();
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700670}
671
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100672void xen_set_pud(pud_t *ptr, pud_t val)
673{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700674 ADD_STATS(pud_update, 1);
675
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100676 /* If page is not pinned, we can just update the entry
677 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700678 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100679 *ptr = val;
680 return;
681 }
682
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700683 ADD_STATS(pud_update_pinned, 1);
684
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100685 xen_set_pud_hyper(ptr, val);
686}
687
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700688void xen_set_pte(pte_t *ptep, pte_t pte)
689{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800690 if (xen_iomap_pte(pte)) {
691 xen_set_iomap_pte(ptep, pte);
692 return;
693 }
694
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700695 ADD_STATS(pte_update, 1);
696// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
697 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
698
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700699#ifdef CONFIG_X86_PAE
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700700 ptep->pte_high = pte.pte_high;
701 smp_wmb();
702 ptep->pte_low = pte.pte_low;
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700703#else
704 *ptep = pte;
705#endif
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700706}
707
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700708#ifdef CONFIG_X86_PAE
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700709void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
710{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800711 if (xen_iomap_pte(pte)) {
712 xen_set_iomap_pte(ptep, pte);
713 return;
714 }
715
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700716 set_64bit((u64 *)ptep, native_pte_val(pte));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700717}
718
719void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
720{
721 ptep->pte_low = 0;
722 smp_wmb(); /* make sure low gets written first */
723 ptep->pte_high = 0;
724}
725
726void xen_pmd_clear(pmd_t *pmdp)
727{
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100728 set_pmd(pmdp, __pmd(0));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700729}
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700730#endif /* CONFIG_X86_PAE */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700731
Jeremy Fitzhardingeabf33032008-03-17 16:37:07 -0700732pmd_t xen_make_pmd(pmdval_t pmd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700733{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700734 pmd = pte_pfn_to_mfn(pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700735 return native_make_pmd(pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700736}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800737PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700738
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700739#if PAGETABLE_LEVELS == 4
740pudval_t xen_pud_val(pud_t pud)
741{
742 return pte_mfn_to_pfn(pud.pud);
743}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800744PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700745
746pud_t xen_make_pud(pudval_t pud)
747{
748 pud = pte_pfn_to_mfn(pud);
749
750 return native_make_pud(pud);
751}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800752PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700753
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700754pgd_t *xen_get_user_pgd(pgd_t *pgd)
755{
756 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
757 unsigned offset = pgd - pgd_page;
758 pgd_t *user_ptr = NULL;
759
760 if (offset < pgd_index(USER_LIMIT)) {
761 struct page *page = virt_to_page(pgd_page);
762 user_ptr = (pgd_t *)page->private;
763 if (user_ptr)
764 user_ptr += offset;
765 }
766
767 return user_ptr;
768}
769
770static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700771{
772 struct mmu_update u;
773
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700774 u.ptr = virt_to_machine(ptr).maddr;
775 u.val = pgd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700776 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700777}
778
779/*
780 * Raw hypercall-based set_pgd, intended for use in early boot before
781 * there are page structures. This implies:
782 * 1. The only existing pagetable is the kernel's
783 * 2. It is always pinned
784 * 3. It has no user pagetable attached to it
785 */
786void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
787{
788 preempt_disable();
789
790 xen_mc_batch();
791
792 __xen_set_pgd_hyper(ptr, val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700793
794 xen_mc_issue(PARAVIRT_LAZY_MMU);
795
796 preempt_enable();
797}
798
799void xen_set_pgd(pgd_t *ptr, pgd_t val)
800{
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700801 pgd_t *user_ptr = xen_get_user_pgd(ptr);
802
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700803 ADD_STATS(pgd_update, 1);
804
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700805 /* If page is not pinned, we can just update the entry
806 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700807 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700808 *ptr = val;
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700809 if (user_ptr) {
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700810 WARN_ON(xen_page_pinned(user_ptr));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700811 *user_ptr = val;
812 }
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700813 return;
814 }
815
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700816 ADD_STATS(pgd_update_pinned, 1);
817 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
818
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700819 /* If it's pinned, then we can at least batch the kernel and
820 user updates together. */
821 xen_mc_batch();
822
823 __xen_set_pgd_hyper(ptr, val);
824 if (user_ptr)
825 __xen_set_pgd_hyper(user_ptr, val);
826
827 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700828}
829#endif /* PAGETABLE_LEVELS == 4 */
830
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700831/*
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700832 * (Yet another) pagetable walker. This one is intended for pinning a
833 * pagetable. This means that it walks a pagetable and calls the
834 * callback function on each page it finds making up the page table,
835 * at every level. It walks the entire pagetable, but it only bothers
836 * pinning pte pages which are below the limit. In the normal case this
837 * will be STACK_TOP_MAX, but at boot we need to pin up to
838 * FIXADDR_TOP.
839 *
840 * For 32-bit the important bit is that we don't pin beyond there,
841 * because then we start getting into Xen's ptes.
842 *
843 * For 64-bit, we must skip the Xen hole in the middle of the address
844 * space, just after the big x86-64 virtual hole.
845 */
Ian Campbell86bbc2c2008-11-21 10:21:33 +0000846static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
847 int (*func)(struct mm_struct *mm, struct page *,
848 enum pt_level),
849 unsigned long limit)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700850{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700851 int flush = 0;
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700852 unsigned hole_low, hole_high;
853 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
854 unsigned pgdidx, pudidx, pmdidx;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700855
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700856 /* The limit is the last byte to be touched */
857 limit--;
858 BUG_ON(limit >= FIXADDR_TOP);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700859
860 if (xen_feature(XENFEAT_auto_translated_physmap))
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700861 return 0;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700862
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700863 /*
864 * 64-bit has a great big hole in the middle of the address
865 * space, which contains the Xen mappings. On 32-bit these
866 * will end up making a zero-sized hole and so is a no-op.
867 */
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700868 hole_low = pgd_index(USER_LIMIT);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700869 hole_high = pgd_index(PAGE_OFFSET);
870
871 pgdidx_limit = pgd_index(limit);
872#if PTRS_PER_PUD > 1
873 pudidx_limit = pud_index(limit);
874#else
875 pudidx_limit = 0;
876#endif
877#if PTRS_PER_PMD > 1
878 pmdidx_limit = pmd_index(limit);
879#else
880 pmdidx_limit = 0;
881#endif
882
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700883 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700884 pud_t *pud;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700885
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700886 if (pgdidx >= hole_low && pgdidx < hole_high)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700887 continue;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700888
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700889 if (!pgd_val(pgd[pgdidx]))
890 continue;
891
892 pud = pud_offset(&pgd[pgdidx], 0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700893
894 if (PTRS_PER_PUD > 1) /* not folded */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700895 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700896
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700897 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700898 pmd_t *pmd;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700899
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700900 if (pgdidx == pgdidx_limit &&
901 pudidx > pudidx_limit)
902 goto out;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700903
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700904 if (pud_none(pud[pudidx]))
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700905 continue;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700906
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700907 pmd = pmd_offset(&pud[pudidx], 0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700908
909 if (PTRS_PER_PMD > 1) /* not folded */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700910 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700911
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700912 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
913 struct page *pte;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700914
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700915 if (pgdidx == pgdidx_limit &&
916 pudidx == pudidx_limit &&
917 pmdidx > pmdidx_limit)
918 goto out;
919
920 if (pmd_none(pmd[pmdidx]))
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700921 continue;
922
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700923 pte = pmd_page(pmd[pmdidx]);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700924 flush |= (*func)(mm, pte, PT_PTE);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700925 }
926 }
927 }
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700928
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -0700929out:
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700930 /* Do the top level last, so that the callbacks can use it as
931 a cue to do final things like tlb flushes. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700932 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700933
934 return flush;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700935}
936
Ian Campbell86bbc2c2008-11-21 10:21:33 +0000937static int xen_pgd_walk(struct mm_struct *mm,
938 int (*func)(struct mm_struct *mm, struct page *,
939 enum pt_level),
940 unsigned long limit)
941{
942 return __xen_pgd_walk(mm, mm->pgd, func, limit);
943}
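
/*
 * Usage sketch (both callers appear later in this file):
 *
 *	__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
 *	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 *
 * The callback runs on every pagetable page below the limit, with the
 * top-level pgd page passed last.
 */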
944
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700945/* If we're using split pte locks, then take the page's lock and
946 return a pointer to it. Otherwise return NULL. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700947static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700948{
949 spinlock_t *ptl = NULL;
950
Jeremy Fitzhardingef7d0b922008-09-09 15:43:22 -0700951#if USE_SPLIT_PTLOCKS
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700952 ptl = __pte_lockptr(page);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700953 spin_lock_nest_lock(ptl, &mm->page_table_lock);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700954#endif
955
956 return ptl;
957}
958
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700959static void xen_pte_unlock(void *v)
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700960{
961 spinlock_t *ptl = v;
962 spin_unlock(ptl);
963}
964
965static void xen_do_pin(unsigned level, unsigned long pfn)
966{
967 struct mmuext_op *op;
968 struct multicall_space mcs;
969
970 mcs = __xen_mc_entry(sizeof(*op));
971 op = mcs.args;
972 op->cmd = level;
973 op->arg1.mfn = pfn_to_mfn(pfn);
974 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
975}
976
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -0700977static int xen_pin_page(struct mm_struct *mm, struct page *page,
978 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700979{
Christoph Lameterd60cd462008-04-28 02:12:51 -0700980 unsigned pgfl = TestSetPagePinned(page);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700981 int flush;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700982
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700983 if (pgfl)
984 flush = 0; /* already pinned */
985 else if (PageHighMem(page))
986 /* kmaps need flushing if we found an unpinned
987 highpage */
988 flush = 1;
989 else {
990 void *pt = lowmem_page_address(page);
991 unsigned long pfn = page_to_pfn(page);
992 struct multicall_space mcs = __xen_mc_entry(0);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -0700993 spinlock_t *ptl;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700994
995 flush = 0;
996
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -0700997 /*
998 * We need to hold the pagetable lock between the time
999 * we make the pagetable RO and when we actually pin
1000 * it. If we don't, then other users may come in and
1001 * attempt to update the pagetable by writing it,
1002 * which will fail because the memory is RO but not
1003 * pinned, so Xen won't do the trap'n'emulate.
1004 *
1005 * If we're using split pte locks, we can't hold the
1006 * entire pagetable's worth of locks during the
1007 * traverse, because we may wrap the preempt count (8
1008 * bits). The solution is to mark RO and pin each PTE
1009 * page while holding the lock. This means the number
1010 * of locks we end up holding is never more than a
1011 * batch size (~32 entries, at present).
1012 *
1013 * If we're not using split pte locks, we needn't pin
1014 * the PTE pages independently, because we're
1015 * protected by the overall pagetable lock.
1016 */
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001017 ptl = NULL;
1018 if (level == PT_PTE)
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001019 ptl = xen_pte_lock(page, mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001020
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001021 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1022 pfn_pte(pfn, PAGE_KERNEL_RO),
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001023 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1024
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001025 if (ptl) {
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001026 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
1027
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001028 /* Queue a deferred unlock for when this batch
1029 is completed. */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001030 xen_mc_callback(xen_pte_unlock, ptl);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001031 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001032 }
1033
1034 return flush;
1035}
1036
1037/* This is called just after a mm has been created, but it has not
1038 been used yet. We need to make sure that its pagetable is all
1039 read-only, and can be pinned. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001040static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001041{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001042 xen_mc_batch();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001043
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001044 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +11001045 /* re-enable interrupts for flushing */
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001046 xen_mc_issue(0);
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +11001047
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001048 kmap_flush_unused();
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +11001049
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001050 xen_mc_batch();
1051 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001052
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001053#ifdef CONFIG_X86_64
1054 {
1055 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1056
1057 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1058
1059 if (user_pgd) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001060 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
Tejf63c2f22008-12-16 11:56:06 -08001061 xen_do_pin(MMUEXT_PIN_L4_TABLE,
1062 PFN_DOWN(__pa(user_pgd)));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001063 }
1064 }
1065#else /* CONFIG_X86_32 */
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001066#ifdef CONFIG_X86_PAE
1067 /* Need to make sure unshared kernel PMD is pinnable */
Jeremy Fitzhardinge47cb2ed2008-11-06 13:48:24 -08001068 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001069 PT_PMD);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001070#endif
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +01001071 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001072#endif /* CONFIG_X86_64 */
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001073 xen_mc_issue(0);
1074}
1075
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001076static void xen_pgd_pin(struct mm_struct *mm)
1077{
1078 __xen_pgd_pin(mm, mm->pgd);
1079}
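
/*
 * Callers hold mm->page_table_lock around this (see xen_activate_mm()
 * and xen_dup_mmap() below), so pinning is serialised against other
 * pagetable operations on the same mm.
 */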
1080
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001081/*
1082 * On save, we need to pin all pagetables to make sure they get their
1083 * mfns turned into pfns. Search the list for any unpinned pgds and pin
1084 * them (unpinned pgds are not currently in use, probably because the
1085 * process is under construction or destruction).
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001086 *
1087 * Expected to be called in stop_machine() ("equivalent to taking
1088 * every spinlock in the system"), so the locking doesn't really
1089 * matter all that much.
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001090 */
1091void xen_mm_pin_all(void)
1092{
1093 unsigned long flags;
1094 struct page *page;
1095
1096 spin_lock_irqsave(&pgd_lock, flags);
1097
1098 list_for_each_entry(page, &pgd_list, lru) {
1099 if (!PagePinned(page)) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001100 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001101 SetPageSavePinned(page);
1102 }
1103 }
1104
1105 spin_unlock_irqrestore(&pgd_lock, flags);
1106}
1107
Eduardo Habkostc1f2f092008-07-08 15:06:24 -07001108/*
1109 * The init_mm pagetable is really pinned as soon as its created, but
1110 * that's before we have page structures to store the bits. So do all
1111 * the book-keeping now.
1112 */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001113static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1114 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001115{
1116 SetPagePinned(page);
1117 return 0;
1118}
1119
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001120static void __init xen_mark_init_mm_pinned(void)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001121{
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001122 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001123}
1124
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001125static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1126 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001127{
Christoph Lameterd60cd462008-04-28 02:12:51 -07001128 unsigned pgfl = TestClearPagePinned(page);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001129
1130 if (pgfl && !PageHighMem(page)) {
1131 void *pt = lowmem_page_address(page);
1132 unsigned long pfn = page_to_pfn(page);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001133 spinlock_t *ptl = NULL;
1134 struct multicall_space mcs;
1135
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001136 /*
1137 * Do the converse of xen_pin_page. If we're using split
1138 * pte locks, we must be holding the lock while
1139 * the pte page is unpinned but still RO to prevent
1140 * concurrent updates from seeing it in this
1141 * partially-pinned state.
1142 */
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001143 if (level == PT_PTE) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001144 ptl = xen_pte_lock(page, mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001145
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001146 if (ptl)
1147 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001148 }
1149
1150 mcs = __xen_mc_entry(0);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001151
1152 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1153 pfn_pte(pfn, PAGE_KERNEL),
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001154 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1155
1156 if (ptl) {
1157 /* unlock when batch completed */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001158 xen_mc_callback(xen_pte_unlock, ptl);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001159 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001160 }
1161
1162 return 0; /* never need to flush on unpin */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001163}
1164
1165/* Release a pagetable's pages back as normal RW */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001166static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001167{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001168 xen_mc_batch();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001169
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001170 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001171
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001172#ifdef CONFIG_X86_64
1173 {
1174 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1175
1176 if (user_pgd) {
Tejf63c2f22008-12-16 11:56:06 -08001177 xen_do_pin(MMUEXT_UNPIN_TABLE,
1178 PFN_DOWN(__pa(user_pgd)));
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001179 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001180 }
1181 }
1182#endif
1183
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001184#ifdef CONFIG_X86_PAE
1185 /* Need to make sure unshared kernel PMD is unpinned */
Jeremy Fitzhardinge47cb2ed2008-11-06 13:48:24 -08001186 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001187 PT_PMD);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001188#endif
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001189
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001190 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001191
1192 xen_mc_issue(0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001193}
1194
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001195static void xen_pgd_unpin(struct mm_struct *mm)
1196{
1197 __xen_pgd_unpin(mm, mm->pgd);
1198}
1199
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001200/*
1201 * On resume, undo any pinning done at save, so that the rest of the
1202 * kernel doesn't see any unexpected pinned pagetables.
1203 */
1204void xen_mm_unpin_all(void)
1205{
1206 unsigned long flags;
1207 struct page *page;
1208
1209 spin_lock_irqsave(&pgd_lock, flags);
1210
1211 list_for_each_entry(page, &pgd_list, lru) {
1212 if (PageSavePinned(page)) {
1213 BUG_ON(!PagePinned(page));
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001214 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001215 ClearPageSavePinned(page);
1216 }
1217 }
1218
1219 spin_unlock_irqrestore(&pgd_lock, flags);
1220}
1221
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001222void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1223{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001224 spin_lock(&next->page_table_lock);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001225 xen_pgd_pin(next);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001226 spin_unlock(&next->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001227}
1228
1229void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1230{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001231 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001232 xen_pgd_pin(mm);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001233 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001234}
1235
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001236
1237#ifdef CONFIG_SMP
1238/* Another cpu may still have their %cr3 pointing at the pagetable, so
1239 we need to repoint it somewhere else before we can unpin it. */
1240static void drop_other_mm_ref(void *info)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001241{
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001242 struct mm_struct *mm = info;
Jeremy Fitzhardingece87b3d2008-07-08 15:06:40 -07001243 struct mm_struct *active_mm;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001244
Brian Gerst9eb912d2009-01-19 00:38:57 +09001245 active_mm = percpu_read(cpu_tlbstate.active_mm);
Jeremy Fitzhardingece87b3d2008-07-08 15:06:40 -07001246
1247 if (active_mm == mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001248 leave_mm(smp_processor_id());
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001249
1250 /* If this cpu still has a stale cr3 reference, then make sure
1251 it has been flushed. */
Jeremy Fitzhardinge7fd7d832009-02-17 23:24:03 -08001252 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001253 load_cr3(swapper_pg_dir);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001254}
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001255
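/*
 * Make sure no CPU still refers to this mm's pagetable, either as its
 * active mm or as a lazily retained cr3, before the pagetable is
 * unpinned; stale users are pushed over to swapper_pg_dir by
 * drop_other_mm_ref().
 */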
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001256static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001257{
Mike Travise4d98202008-12-16 17:34:05 -08001258 cpumask_var_t mask;
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001259 unsigned cpu;
1260
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001261 if (current->active_mm == mm) {
1262 if (current->mm == mm)
1263 load_cr3(swapper_pg_dir);
1264 else
1265 leave_mm(smp_processor_id());
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001266 }
1267
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001268 /* Get the "official" set of cpus referring to our pagetable. */
Mike Travise4d98202008-12-16 17:34:05 -08001269 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1270 for_each_online_cpu(cpu) {
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001271 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
Mike Travise4d98202008-12-16 17:34:05 -08001272 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1273 continue;
1274 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1275 }
1276 return;
1277 }
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001278 cpumask_copy(mask, mm_cpumask(mm));
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001279
 1280	/* It's possible that a vcpu may have a stale reference to our
 1281	   cr3, because it's in lazy mode and hasn't yet flushed its
 1282	   set of pending hypercalls. In this case, we can look at its
 1283	   actual current cr3 value, and force it to flush if
 1284	   needed. */
1285 for_each_online_cpu(cpu) {
1286 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
Mike Travise4d98202008-12-16 17:34:05 -08001287 cpumask_set_cpu(cpu, mask);
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001288 }
1289
Mike Travise4d98202008-12-16 17:34:05 -08001290 if (!cpumask_empty(mask))
1291 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1292 free_cpumask_var(mask);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001293}
1294#else
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001295static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001296{
1297 if (current->active_mm == mm)
1298 load_cr3(swapper_pg_dir);
1299}
1300#endif
1301
1302/*
 1303 * While a process runs, Xen pins its pagetable, which means that the
 1304 * hypervisor forces it to be read-only and controls all updates
 1305 * to it. This means that all pagetable updates have to go via the
 1306 * hypervisor, which is moderately expensive.
 1307 *
 1308 * Since we're pulling the pagetable down, we switch to init_mm,
 1309 * unpin the old process pagetable and mark it all read-write, which
 1310 * allows further operations on it to be simple memory accesses.
 1311 *
 1312 * The only subtle point is that another CPU may still be using the
 1313 * pagetable because of lazy tlb flushing. This means we need to
 1314 * switch all CPUs off this pagetable before we can unpin it.
1315 */
1316void xen_exit_mmap(struct mm_struct *mm)
1317{
1318 get_cpu(); /* make sure we don't move around */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001319 xen_drop_mm_ref(mm);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001320 put_cpu();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001321
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001322 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingedf912ea2007-09-25 11:50:00 -07001323
1324 /* pgd may not be pinned in the error exit path of execve */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001325 if (xen_page_pinned(mm->pgd))
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001326 xen_pgd_unpin(mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001327
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001328 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001329}
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001330
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001331static __init void xen_pagetable_setup_start(pgd_t *base)
1332{
1333}
1334
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001335static void xen_post_allocator_init(void);
1336
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001337static __init void xen_pagetable_setup_done(pgd_t *base)
1338{
1339 xen_setup_shared_info();
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001340 xen_post_allocator_init();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001341}
1342
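/*
 * cr2 (the page-faulting address) is read and written through the
 * vcpu info area that Xen shares with the guest, rather than the
 * hardware register.
 */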
1343static void xen_write_cr2(unsigned long cr2)
1344{
1345 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1346}
1347
1348static unsigned long xen_read_cr2(void)
1349{
1350 return percpu_read(xen_vcpu)->arch.cr2;
1351}
1352
1353unsigned long xen_read_cr2_direct(void)
1354{
1355 return percpu_read(xen_vcpu_info.arch.cr2);
1356}
1357
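/*
 * Local TLB flushes become MMUEXT_TLB_FLUSH_LOCAL / MMUEXT_INVLPG_LOCAL
 * mmuext ops, queued as multicalls so they can be batched with other
 * pending lazy-MMU work.
 */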
1358static void xen_flush_tlb(void)
1359{
1360 struct mmuext_op *op;
1361 struct multicall_space mcs;
1362
1363 preempt_disable();
1364
1365 mcs = xen_mc_entry(sizeof(*op));
1366
1367 op = mcs.args;
1368 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1369 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1370
1371 xen_mc_issue(PARAVIRT_LAZY_MMU);
1372
1373 preempt_enable();
1374}
1375
1376static void xen_flush_tlb_single(unsigned long addr)
1377{
1378 struct mmuext_op *op;
1379 struct multicall_space mcs;
1380
1381 preempt_disable();
1382
1383 mcs = xen_mc_entry(sizeof(*op));
1384 op = mcs.args;
1385 op->cmd = MMUEXT_INVLPG_LOCAL;
1386 op->arg1.linear_addr = addr & PAGE_MASK;
1387 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1388
1389 xen_mc_issue(PARAVIRT_LAZY_MMU);
1390
1391 preempt_enable();
1392}
1393
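/*
 * Remote TLB flushes are handed to the hypervisor as a single mmuext
 * op with a vcpumask (MMUEXT_TLB_FLUSH_MULTI for a full flush,
 * MMUEXT_INVLPG_MULTI for one address) instead of sending IPIs
 * ourselves.
 */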
1394static void xen_flush_tlb_others(const struct cpumask *cpus,
1395 struct mm_struct *mm, unsigned long va)
1396{
1397 struct {
1398 struct mmuext_op op;
1399 DECLARE_BITMAP(mask, NR_CPUS);
1400 } *args;
1401 struct multicall_space mcs;
1402
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001403 if (cpumask_empty(cpus))
1404 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001405
1406 mcs = xen_mc_entry(sizeof(*args));
1407 args = mcs.args;
1408 args->op.arg2.vcpumask = to_cpumask(args->mask);
1409
 1410	/* Remove us, and any offline CPUs. */
1411 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1412 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001413
1414 if (va == TLB_FLUSH_ALL) {
1415 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1416 } else {
1417 args->op.cmd = MMUEXT_INVLPG_MULTI;
1418 args->op.arg1.linear_addr = va;
1419 }
1420
1421 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1422
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001423 xen_mc_issue(PARAVIRT_LAZY_MMU);
1424}
1425
1426static unsigned long xen_read_cr3(void)
1427{
1428 return percpu_read(xen_cr3);
1429}
1430
1431static void set_current_cr3(void *v)
1432{
1433 percpu_write(xen_current_cr3, (unsigned long)v);
1434}
1435
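/*
 * Queue a base-pointer switch for either the kernel or the user
 * pagetable (MMUEXT_NEW_BASEPTR vs MMUEXT_NEW_USER_BASEPTR).  The
 * caller is expected to open and issue the surrounding multicall
 * batch.
 */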
1436static void __xen_write_cr3(bool kernel, unsigned long cr3)
1437{
1438 struct mmuext_op *op;
1439 struct multicall_space mcs;
1440 unsigned long mfn;
1441
1442 if (cr3)
1443 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1444 else
1445 mfn = 0;
1446
1447 WARN_ON(mfn == 0 && kernel);
1448
1449 mcs = __xen_mc_entry(sizeof(*op));
1450
1451 op = mcs.args;
1452 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1453 op->arg1.mfn = mfn;
1454
1455 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1456
1457 if (kernel) {
1458 percpu_write(xen_cr3, cr3);
1459
1460 /* Update xen_current_cr3 once the batch has actually
1461 been submitted. */
1462 xen_mc_callback(set_current_cr3, (void *)cr3);
1463 }
1464}
1465
1466static void xen_write_cr3(unsigned long cr3)
1467{
1468 BUG_ON(preemptible());
1469
1470 xen_mc_batch(); /* disables interrupts */
1471
 1472	/* Update while interrupts are disabled, so it's atomic with
 1473	   respect to IPIs */
1474 percpu_write(xen_cr3, cr3);
1475
1476 __xen_write_cr3(true, cr3);
1477
1478#ifdef CONFIG_X86_64
1479 {
1480 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1481 if (user_pgd)
1482 __xen_write_cr3(false, __pa(user_pgd));
1483 else
1484 __xen_write_cr3(false, 0);
1485 }
1486#endif
1487
1488 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1489}
1490
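/*
 * On 64-bit, Xen maintains a separate user-mode pagetable, so each
 * kernel pgd gets a companion "user pgd" allocated here and stashed
 * in page->private, where xen_get_user_pgd() finds it again.
 */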
1491static int xen_pgd_alloc(struct mm_struct *mm)
1492{
1493 pgd_t *pgd = mm->pgd;
1494 int ret = 0;
1495
1496 BUG_ON(PagePinned(virt_to_page(pgd)));
1497
1498#ifdef CONFIG_X86_64
1499 {
1500 struct page *page = virt_to_page(pgd);
1501 pgd_t *user_pgd;
1502
1503 BUG_ON(page->private != 0);
1504
1505 ret = -ENOMEM;
1506
1507 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1508 page->private = (unsigned long)user_pgd;
1509
1510 if (user_pgd != NULL) {
1511 user_pgd[pgd_index(VSYSCALL_START)] =
1512 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1513 ret = 0;
1514 }
1515
1516 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1517 }
1518#endif
1519
1520 return ret;
1521}
1522
1523static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1524{
1525#ifdef CONFIG_X86_64
1526 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1527
1528 if (user_pgd)
1529 free_page((unsigned long)user_pgd);
1530#endif
1531}
1532
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001533#ifdef CONFIG_X86_32
1534static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1535{
1536 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1537 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1538 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1539 pte_val_ma(pte));
1540
1541 return pte;
1542}
1543
1544/* Init-time set_pte while constructing initial pagetables, which
1545 doesn't allow RO pagetable pages to be remapped RW */
1546static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1547{
1548 pte = mask_rw_pte(ptep, pte);
1549
1550 xen_set_pte(ptep, pte);
1551}
1552#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001553
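/* Issue a single pin/unpin mmuext op immediately, outside any multicall batch. */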
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001554static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1555{
1556 struct mmuext_op op;
1557 op.cmd = cmd;
1558 op.arg1.mfn = pfn_to_mfn(pfn);
1559 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1560 BUG();
1561}
1562
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001563/* Early in boot, while setting up the initial pagetable, assume
1564 everything is pinned. */
1565static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1566{
1567#ifdef CONFIG_FLATMEM
1568 BUG_ON(mem_map); /* should only be used early */
1569#endif
1570 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001571 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1572}
1573
1574/* Used for pmd and pud */
1575static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1576{
1577#ifdef CONFIG_FLATMEM
1578 BUG_ON(mem_map); /* should only be used early */
1579#endif
1580 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001581}
1582
1583/* Early release_pte assumes that all pts are pinned, since there's
1584 only init_mm and anything attached to that is pinned. */
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001585static __init void xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001586{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001587 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001588 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1589}
1590
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001591static __init void xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001592{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001593 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001594}
1595
 1596/* This needs to make sure the new pte page is pinned iff it's being
 1597   attached to a pinned pagetable. */
1598static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1599{
1600 struct page *page = pfn_to_page(pfn);
1601
1602 if (PagePinned(virt_to_page(mm->pgd))) {
1603 SetPagePinned(page);
1604
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001605 if (!PageHighMem(page)) {
1606 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1607 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1608 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1609 } else {
1610 /* make sure there are no stray mappings of
1611 this page */
1612 kmap_flush_unused();
1613 }
1614 }
1615}
1616
1617static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1618{
1619 xen_alloc_ptpage(mm, pfn, PT_PTE);
1620}
1621
1622static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1623{
1624 xen_alloc_ptpage(mm, pfn, PT_PMD);
1625}
1626
 1627/* This should never be called until we're OK to use struct page */
1628static void xen_release_ptpage(unsigned long pfn, unsigned level)
1629{
1630 struct page *page = pfn_to_page(pfn);
1631
1632 if (PagePinned(page)) {
1633 if (!PageHighMem(page)) {
1634 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1635 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1636 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1637 }
1638 ClearPagePinned(page);
1639 }
1640}
1641
1642static void xen_release_pte(unsigned long pfn)
1643{
1644 xen_release_ptpage(pfn, PT_PTE);
1645}
1646
1647static void xen_release_pmd(unsigned long pfn)
1648{
1649 xen_release_ptpage(pfn, PT_PMD);
1650}
1651
1652#if PAGETABLE_LEVELS == 4
1653static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1654{
1655 xen_alloc_ptpage(mm, pfn, PT_PUD);
1656}
1657
1658static void xen_release_pud(unsigned long pfn)
1659{
1660 xen_release_ptpage(pfn, PT_PUD);
1661}
1662#endif
1663
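/*
 * On 32-bit, reserve the top of the virtual address space for the
 * hypervisor; the exact boundary is taken from
 * XENVER_platform_parameters when available, falling back to
 * HYPERVISOR_VIRT_START.
 */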
1664void __init xen_reserve_top(void)
1665{
1666#ifdef CONFIG_X86_32
1667 unsigned long top = HYPERVISOR_VIRT_START;
1668 struct xen_platform_parameters pp;
1669
1670 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1671 top = pp.virt_start;
1672
1673 reserve_top_address(-top);
1674#endif /* CONFIG_X86_32 */
1675}
1676
1677/*
 1678 * Like __va(), but returns the address in the kernel mapping (which is
 1679 * all we have until the physical memory mapping has been set up).
1680 */
1681static void *__ka(phys_addr_t paddr)
1682{
1683#ifdef CONFIG_X86_64
1684 return (void *)(paddr + __START_KERNEL_map);
1685#else
1686 return __va(paddr);
1687#endif
1688}
1689
1690/* Convert a machine address to physical address */
1691static unsigned long m2p(phys_addr_t maddr)
1692{
1693 phys_addr_t paddr;
1694
1695 maddr &= PTE_PFN_MASK;
1696 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1697
1698 return paddr;
1699}
1700
1701/* Convert a machine address to kernel virtual */
1702static void *m2v(phys_addr_t maddr)
1703{
1704 return __ka(m2p(maddr));
1705}
1706
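/*
 * Change the protection of a single kernel mapping with a direct
 * update_va_mapping hypercall.
 */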
1707static void set_page_prot(void *addr, pgprot_t prot)
1708{
1709 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1710 pte_t pte = pfn_pte(pfn, prot);
1711
1712 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1713 BUG();
1714}
1715
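/*
 * Build the early identity mapping of the first max_pfn pages under
 * the given pmd, reusing any pte pages Xen already provided and
 * otherwise carving new ones out of the brk-allocated
 * level1_ident_pgt.  All pagetable pages touched here are left
 * read-only.
 */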
1716static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1717{
1718 unsigned pmdidx, pteidx;
1719 unsigned ident_pte;
1720 unsigned long pfn;
1721
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001722 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1723 PAGE_SIZE);
1724
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001725 ident_pte = 0;
1726 pfn = 0;
1727 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1728 pte_t *pte_page;
1729
1730 /* Reuse or allocate a page of ptes */
1731 if (pmd_present(pmd[pmdidx]))
1732 pte_page = m2v(pmd[pmdidx].pmd);
1733 else {
1734 /* Check for free pte pages */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001735 if (ident_pte == LEVEL1_IDENT_ENTRIES)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001736 break;
1737
1738 pte_page = &level1_ident_pgt[ident_pte];
1739 ident_pte += PTRS_PER_PTE;
1740
1741 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1742 }
1743
1744 /* Install mappings */
1745 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1746 pte_t pte;
1747
1748 if (pfn > max_pfn_mapped)
1749 max_pfn_mapped = pfn;
1750
1751 if (!pte_none(pte_page[pteidx]))
1752 continue;
1753
1754 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1755 pte_page[pteidx] = pte;
1756 }
1757 }
1758
1759 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1760 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1761
1762 set_page_prot(pmd, PAGE_KERNEL_RO);
1763}
1764
1765#ifdef CONFIG_X86_64
1766static void convert_pfn_mfn(void *v)
1767{
1768 pte_t *pte = v;
1769 int i;
1770
1771 /* All levels are converted the same way, so just treat them
1772 as ptes. */
1773 for (i = 0; i < PTRS_PER_PTE; i++)
1774 pte[i] = xen_make_pte(pte[i].pte);
1775}
1776
1777/*
 1778 * Set up the initial kernel pagetable.
 1779 *
 1780 * We can construct this by grafting the Xen-provided pagetable into
 1781 * head_64.S's preconstructed pagetables. We copy the Xen L2s into
 1782 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1783 * means that only the kernel has a physical mapping to start with -
1784 * but that's enough to get __va working. We need to fill in the rest
1785 * of the physical mapping once some sort of allocator has been set
1786 * up.
1787 */
1788__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1789 unsigned long max_pfn)
1790{
1791 pud_t *l3;
1792 pmd_t *l2;
1793
1794 /* Zap identity mapping */
1795 init_level4_pgt[0] = __pgd(0);
1796
1797 /* Pre-constructed entries are in pfn, so convert to mfn */
1798 convert_pfn_mfn(init_level4_pgt);
1799 convert_pfn_mfn(level3_ident_pgt);
1800 convert_pfn_mfn(level3_kernel_pgt);
1801
1802 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1803 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1804
1805 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1806 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1807
1808 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1809 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1810 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1811
1812 /* Set up identity map */
1813 xen_map_identity_early(level2_ident_pgt, max_pfn);
1814
1815 /* Make pagetable pieces RO */
1816 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1817 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1818 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1819 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1820 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1821 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1822
1823 /* Pin down new L4 */
1824 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1825 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1826
1827 /* Unpin Xen-provided one */
1828 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1829
1830 /* Switch over */
1831 pgd = init_level4_pgt;
1832
1833 /*
1834 * At this stage there can be no user pgd, and no page
 1835	 * structure to attach it to, so make sure we only set the kernel
1836 * pgd.
1837 */
1838 xen_mc_batch();
1839 __xen_write_cr3(true, __pa(pgd));
1840 xen_mc_issue(PARAVIRT_LAZY_CPU);
1841
1842 reserve_early(__pa(xen_start_info->pt_base),
1843 __pa(xen_start_info->pt_base +
1844 xen_start_info->nr_pt_frames * PAGE_SIZE),
1845 "XEN PAGETABLES");
1846
1847 return pgd;
1848}
1849#else /* !CONFIG_X86_64 */
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07001850static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001851
1852__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1853 unsigned long max_pfn)
1854{
1855 pmd_t *kernel_pmd;
1856
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07001857	level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1858
Jeremy Fitzhardinge93dbda72009-02-26 17:35:44 -08001859 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1860 xen_start_info->nr_pt_frames * PAGE_SIZE +
1861 512*1024);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001862
1863 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1864 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1865
1866 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1867
1868 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1869 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1870 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1871
1872 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1873 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1874 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1875
1876 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1877
1878 xen_write_cr3(__pa(swapper_pg_dir));
1879
1880 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1881
Jeremy Fitzhardinge33df4db2009-05-07 11:56:44 -07001882 reserve_early(__pa(xen_start_info->pt_base),
1883 __pa(xen_start_info->pt_base +
1884 xen_start_info->nr_pt_frames * PAGE_SIZE),
1885 "XEN PAGETABLES");
1886
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001887 return swapper_pg_dir;
1888}
1889#endif /* CONFIG_X86_64 */
1890
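/*
 * Fixmap slots fall into three classes here: ordinary local RAM
 * mappings (pfn-based pte), guest MFN mappings such as the paravirt
 * bootmap, and everything else, which is assumed to be a hardware/IO
 * mapping and gets _PAGE_IOMAP.
 */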
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07001891static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001892{
1893 pte_t pte;
1894
1895 phys >>= PAGE_SHIFT;
1896
1897 switch (idx) {
1898 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1899#ifdef CONFIG_X86_F00F_BUG
1900 case FIX_F00F_IDT:
1901#endif
1902#ifdef CONFIG_X86_32
1903 case FIX_WP_TEST:
1904 case FIX_VDSO:
1905# ifdef CONFIG_HIGHMEM
1906 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1907# endif
1908#else
1909 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1910#endif
1911#ifdef CONFIG_X86_LOCAL_APIC
1912 case FIX_APIC_BASE: /* maps dummy local APIC */
1913#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08001914 case FIX_TEXT_POKE0:
1915 case FIX_TEXT_POKE1:
1916 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001917 pte = pfn_pte(phys, prot);
1918 break;
1919
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001920 case FIX_PARAVIRT_BOOTMAP:
1921 /* This is an MFN, but it isn't an IO mapping from the
1922 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001923 pte = mfn_pte(phys, prot);
1924 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001925
1926 default:
1927 /* By default, set_fixmap is used for hardware mappings */
1928 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1929 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001930 }
1931
1932 __native_set_fixmap(idx, pte);
1933
1934#ifdef CONFIG_X86_64
1935 /* Replicate changes to map the vsyscall page into the user
1936 pagetable vsyscall mapping. */
1937 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1938 unsigned long vaddr = __fix_to_virt(idx);
1939 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1940 }
1941#endif
1942}
1943
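/*
 * Once the kernel's normal allocators are running, replace the early
 * boot pv_mmu_ops (the *_init variants) with the ones that rely on
 * struct page and pinning state.
 */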
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001944static __init void xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001945{
1946 pv_mmu_ops.set_pte = xen_set_pte;
1947 pv_mmu_ops.set_pmd = xen_set_pmd;
1948 pv_mmu_ops.set_pud = xen_set_pud;
1949#if PAGETABLE_LEVELS == 4
1950 pv_mmu_ops.set_pgd = xen_set_pgd;
1951#endif
1952
1953 /* This will work as long as patching hasn't happened yet
1954 (which it hasn't) */
1955 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1956 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1957 pv_mmu_ops.release_pte = xen_release_pte;
1958 pv_mmu_ops.release_pmd = xen_release_pmd;
1959#if PAGETABLE_LEVELS == 4
1960 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1961 pv_mmu_ops.release_pud = xen_release_pud;
1962#endif
1963
1964#ifdef CONFIG_X86_64
1965 SetPagePinned(virt_to_page(level3_user_vsyscall));
1966#endif
1967 xen_mark_init_mm_pinned();
1968}
1969
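/* Flush any multicalls queued while in lazy MMU mode before leaving it. */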
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001970static void xen_leave_lazy_mmu(void)
1971{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08001972 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001973 xen_mc_flush();
1974 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08001975 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001976}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001977
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02001978static const struct pv_mmu_ops xen_mmu_ops __initdata = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001979 .read_cr2 = xen_read_cr2,
1980 .write_cr2 = xen_write_cr2,
1981
1982 .read_cr3 = xen_read_cr3,
1983 .write_cr3 = xen_write_cr3,
1984
1985 .flush_tlb_user = xen_flush_tlb,
1986 .flush_tlb_kernel = xen_flush_tlb,
1987 .flush_tlb_single = xen_flush_tlb_single,
1988 .flush_tlb_others = xen_flush_tlb_others,
1989
1990 .pte_update = paravirt_nop,
1991 .pte_update_defer = paravirt_nop,
1992
1993 .pgd_alloc = xen_pgd_alloc,
1994 .pgd_free = xen_pgd_free,
1995
1996 .alloc_pte = xen_alloc_pte_init,
1997 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001998 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001999 .alloc_pmd_clone = paravirt_nop,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002000 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002001
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002002#ifdef CONFIG_X86_64
2003 .set_pte = xen_set_pte,
2004#else
2005 .set_pte = xen_set_pte_init,
2006#endif
2007 .set_pte_at = xen_set_pte_at,
2008 .set_pmd = xen_set_pmd_hyper,
2009
2010 .ptep_modify_prot_start = __ptep_modify_prot_start,
2011 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2012
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002013 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2014 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002015
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002016 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2017 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002018
2019#ifdef CONFIG_X86_PAE
2020 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002021 .pte_clear = xen_pte_clear,
2022 .pmd_clear = xen_pmd_clear,
2023#endif /* CONFIG_X86_PAE */
2024 .set_pud = xen_set_pud_hyper,
2025
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002026 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2027 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002028
2029#if PAGETABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002030 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2031 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002032 .set_pgd = xen_set_pgd_hyper,
2033
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002034 .alloc_pud = xen_alloc_pmd_init,
2035 .release_pud = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002036#endif /* PAGETABLE_LEVELS == 4 */
2037
2038 .activate_mm = xen_activate_mm,
2039 .dup_mmap = xen_dup_mmap,
2040 .exit_mmap = xen_exit_mmap,
2041
2042 .lazy_mode = {
2043 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002044 .leave = xen_leave_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002045 },
2046
2047 .set_fixmap = xen_set_fixmap,
2048};
2049
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002050void __init xen_init_mmu_ops(void)
2051{
2052 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2053 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2054 pv_mmu_ops = xen_mmu_ops;
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -07002055
2056 vmap_lazy_unmap = false;
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002057}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002058
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002059/* Protected by xen_reservation_lock. */
2060#define MAX_CONTIG_ORDER 9 /* 2MB */
2061static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2062
2063#define VOID_PTE (mfn_pte(0, __pgprot(0)))
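/*
 * Clear the ptes covering a 2^order page range starting at vaddr and
 * drop the corresponding p2m entries, optionally recording the old
 * mfns (in_frames) and/or the pfns (out_frames) for a subsequent
 * memory exchange.
 */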
2064static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2065 unsigned long *in_frames,
2066 unsigned long *out_frames)
2067{
2068 int i;
2069 struct multicall_space mcs;
2070
2071 xen_mc_batch();
2072 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2073 mcs = __xen_mc_entry(0);
2074
2075 if (in_frames)
2076 in_frames[i] = virt_to_mfn(vaddr);
2077
2078 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2079 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2080
2081 if (out_frames)
2082 out_frames[i] = virt_to_pfn(vaddr);
2083 }
2084 xen_mc_issue(0);
2085}
2086
2087/*
2088 * Update the pfn-to-mfn mappings for a virtual address range, either to
2089 * point to an array of mfns, or contiguously from a single starting
2090 * mfn.
2091 */
2092static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2093 unsigned long *mfns,
2094 unsigned long first_mfn)
2095{
2096 unsigned i, limit;
2097 unsigned long mfn;
2098
2099 xen_mc_batch();
2100
2101 limit = 1u << order;
2102 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2103 struct multicall_space mcs;
2104 unsigned flags;
2105
2106 mcs = __xen_mc_entry(0);
2107 if (mfns)
2108 mfn = mfns[i];
2109 else
2110 mfn = first_mfn + i;
2111
2112 if (i < (limit - 1))
2113 flags = 0;
2114 else {
2115 if (order == 0)
2116 flags = UVMF_INVLPG | UVMF_ALL;
2117 else
2118 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2119 }
2120
2121 MULTI_update_va_mapping(mcs.mc, vaddr,
2122 mfn_pte(mfn, PAGE_KERNEL), flags);
2123
2124 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2125 }
2126
2127 xen_mc_issue(0);
2128}
2129
2130/*
2131 * Perform the hypercall to exchange a region of our pfns to point to
2132 * memory with the required contiguous alignment. Takes the pfns as
2133 * input, and populates mfns as output.
2134 *
2135 * Returns a success code indicating whether the hypervisor was able to
2136 * satisfy the request or not.
2137 */
2138static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2139 unsigned long *pfns_in,
2140 unsigned long extents_out,
2141 unsigned int order_out,
2142 unsigned long *mfns_out,
2143 unsigned int address_bits)
2144{
2145 long rc;
2146 int success;
2147
2148 struct xen_memory_exchange exchange = {
2149 .in = {
2150 .nr_extents = extents_in,
2151 .extent_order = order_in,
2152 .extent_start = pfns_in,
2153 .domid = DOMID_SELF
2154 },
2155 .out = {
2156 .nr_extents = extents_out,
2157 .extent_order = order_out,
2158 .extent_start = mfns_out,
2159 .address_bits = address_bits,
2160 .domid = DOMID_SELF
2161 }
2162 };
2163
2164 BUG_ON(extents_in << order_in != extents_out << order_out);
2165
2166 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2167 success = (exchange.nr_exchanged == extents_in);
2168
2169 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2170 BUG_ON(success && (rc != 0));
2171
2172 return success;
2173}
2174
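/*
 * Exchange the pfns backing a 2^order range at vstart for a
 * machine-contiguous extent addressable within address_bits, so the
 * region can be handed to hardware that needs physically contiguous
 * memory.
 */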
2175int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2176 unsigned int address_bits)
2177{
2178 unsigned long *in_frames = discontig_frames, out_frame;
2179 unsigned long flags;
2180 int success;
2181
2182 /*
2183 * Currently an auto-translated guest will not perform I/O, nor will
2184 * it require PAE page directories below 4GB. Therefore any calls to
2185 * this function are redundant and can be ignored.
2186 */
2187
2188 if (xen_feature(XENFEAT_auto_translated_physmap))
2189 return 0;
2190
2191 if (unlikely(order > MAX_CONTIG_ORDER))
2192 return -ENOMEM;
2193
2194 memset((void *) vstart, 0, PAGE_SIZE << order);
2195
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002196 spin_lock_irqsave(&xen_reservation_lock, flags);
2197
2198 /* 1. Zap current PTEs, remembering MFNs. */
2199 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2200
2201 /* 2. Get a new contiguous memory extent. */
2202 out_frame = virt_to_pfn(vstart);
2203 success = xen_exchange_memory(1UL << order, 0, in_frames,
2204 1, order, &out_frame,
2205 address_bits);
2206
2207 /* 3. Map the new extent in place of old pages. */
2208 if (success)
2209 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2210 else
2211 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2212
2213 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2214
2215 return success ? 0 : -ENOMEM;
2216}
2217EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2218
2219void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2220{
2221 unsigned long *out_frames = discontig_frames, in_frame;
2222 unsigned long flags;
2223 int success;
2224
2225 if (xen_feature(XENFEAT_auto_translated_physmap))
2226 return;
2227
2228 if (unlikely(order > MAX_CONTIG_ORDER))
2229 return;
2230
2231 memset((void *) vstart, 0, PAGE_SIZE << order);
2232
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002233 spin_lock_irqsave(&xen_reservation_lock, flags);
2234
2235 /* 1. Find start MFN of contiguous extent. */
2236 in_frame = virt_to_mfn(vstart);
2237
2238 /* 2. Zap current PTEs. */
2239 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2240
2241 /* 3. Do the exchange for non-contiguous MFNs. */
2242 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2243 0, out_frames, 0);
2244
2245 /* 4. Map new pages in place of old pages. */
2246 if (success)
2247 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2248 else
2249 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2250
2251 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2252}
2253EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2254
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002255#ifdef CONFIG_XEN_PVHVM
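/*
 * On PV-on-HVM, HVMOP_pagetable_dying tells the hypervisor a
 * pagetable is about to be torn down, presumably so it can discard
 * any shadow state for it eagerly instead of on demand.
 */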
Stefano Stabellini59151002010-06-17 14:22:52 +01002256static void xen_hvm_exit_mmap(struct mm_struct *mm)
2257{
2258 struct xen_hvm_pagetable_dying a;
2259 int rc;
2260
2261 a.domid = DOMID_SELF;
2262 a.gpa = __pa(mm->pgd);
2263 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2264 WARN_ON_ONCE(rc < 0);
2265}
2266
2267static int is_pagetable_dying_supported(void)
2268{
2269 struct xen_hvm_pagetable_dying a;
2270 int rc = 0;
2271
2272 a.domid = DOMID_SELF;
2273 a.gpa = 0x00;
2274 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2275 if (rc < 0) {
2276 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2277 return 0;
2278 }
2279 return 1;
2280}
2281
2282void __init xen_hvm_init_mmu_ops(void)
2283{
2284 if (is_pagetable_dying_supported())
2285 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2286}
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002287#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002288
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002289#ifdef CONFIG_XEN_DEBUG_FS
2290
2291static struct dentry *d_mmu_debug;
2292
2293static int __init xen_mmu_debugfs(void)
2294{
2295 struct dentry *d_xen = xen_init_debugfs();
2296
2297 if (d_xen == NULL)
2298 return -ENOMEM;
2299
2300 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2301
2302 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2303
2304 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2305 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2306 &mmu_stats.pgd_update_pinned);
2307 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2308 &mmu_stats.pgd_update_pinned);
2309
2310 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2311 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2312 &mmu_stats.pud_update_pinned);
2313 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2314 &mmu_stats.pud_update_pinned);
2315
2316 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2317 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2318 &mmu_stats.pmd_update_pinned);
2319 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2320 &mmu_stats.pmd_update_pinned);
2321
2322 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2323// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2324// &mmu_stats.pte_update_pinned);
2325 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2326 &mmu_stats.pte_update_pinned);
2327
2328 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2329 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2330 &mmu_stats.mmu_update_extended);
2331 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2332 mmu_stats.mmu_update_histo, 20);
2333
2334 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2335 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2336 &mmu_stats.set_pte_at_batched);
2337 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2338 &mmu_stats.set_pte_at_current);
2339 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2340 &mmu_stats.set_pte_at_kernel);
2341
2342 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2343 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2344 &mmu_stats.prot_commit_batched);
2345
2346 return 0;
2347}
2348fs_initcall(xen_mmu_debugfs);
2349
2350#endif /* CONFIG_XEN_DEBUG_FS */