/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/gfp.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/e820.h>
#include <asm/linkage.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/memory.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

#define MMU_UPDATE_HISTO	30

#ifdef CONFIG_XEN_DEBUG_FS

static struct {
	u32 pgd_update;
	u32 pgd_update_pinned;
	u32 pgd_update_batched;

	u32 pud_update;
	u32 pud_update_pinned;
	u32 pud_update_batched;

	u32 pmd_update;
	u32 pmd_update_pinned;
	u32 pmd_update_batched;

	u32 pte_update;
	u32 pte_update_pinned;
	u32 pte_update_batched;

	u32 mmu_update;
	u32 mmu_update_extended;
	u32 mmu_update_histo[MMU_UPDATE_HISTO];

	u32 prot_commit;
	u32 prot_commit_batched;

	u32 set_pte_at;
	u32 set_pte_at_batched;
	u32 set_pte_at_pinned;
	u32 set_pte_at_current;
	u32 set_pte_at_kernel;
} mmu_stats;

static u8 zero_stats;

static inline void check_zero(void)
{
	if (unlikely(zero_stats)) {
		memset(&mmu_stats, 0, sizeof(mmu_stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val)			\
	do { check_zero(); mmu_stats.elem += (val); } while(0)

#else  /* !CONFIG_XEN_DEBUG_FS */

#define ADD_STATS(elem, val) do { (void)(val); } while(0)

#endif /* CONFIG_XEN_DEBUG_FS */


/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */

/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);		 /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
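
/*
 * Illustrative sketch, not part of the build: how the two variables
 * above are meant to be consumed.  A cross-vcpu reader must use
 * xen_current_cr3, since another vcpu's xen_cr3 may describe an update
 * still sitting in that vcpu's lazy-mode hypercall queue.  The function
 * name is made up for illustration.
 */
#if 0
static bool vcpu_uses_pgd_example(int cpu, pgd_t *pgd)
{
	/* Safe from any cpu: set only after the set-cr3 hypercall completes */
	return per_cpu(xen_current_cr3, cpu) == __pa(pgd);
}
#endif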


/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)


#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;

static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
	__page_aligned_bss;

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
	return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_ENTRIES_PER_PAGE;
}

/* Build the parallel p2m_top_mfn structures */
void xen_build_mfn_list_list(void)
{
	unsigned pfn, idx;

	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
	}

	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
	}
}

void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn_list);
	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned pfn;

	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top[topidx] = &mfn_list[pfn];
	}

	xen_build_mfn_list_list();
}

unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	idx = p2m_index(pfn);
	return p2m_top[topidx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
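
/*
 * Illustrative sketch, not part of the build: the two-level p2m lookup
 * above, unrolled.  With 4K pages, each leaf page holds PAGE_SIZE /
 * sizeof(unsigned long) entries (512 on 64-bit, 1024 on 32-bit), so a
 * pfn splits into a p2m_top index plus an offset within the leaf page.
 * The function name is made up for illustration.
 */
#if 0
static unsigned long p2m_lookup_example(unsigned long pfn)
{
	unsigned topidx = p2m_top_index(pfn);	/* pfn / P2M_ENTRIES_PER_PAGE */
	unsigned idx = p2m_index(pfn);		/* pfn % P2M_ENTRIES_PER_PAGE */

	/* Unpopulated ranges all share the p2m_missing page, whose
	   entries are ~0UL, so holes read back as INVALID_P2M_ENTRY. */
	return p2m_top[topidx][idx];
}
#endif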

/* install a new p2m_top page */
bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
{
	unsigned topidx = p2m_top_index(pfn);
	unsigned long **pfnp, *mfnp;
	unsigned i;

	pfnp = &p2m_top[topidx];
	mfnp = &p2m_top_mfn[topidx];

	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
		p[i] = INVALID_P2M_ENTRY;

	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
		*mfnp = virt_to_mfn(p);
		return true;
	}

	return false;
}

static void alloc_p2m(unsigned long pfn)
{
	unsigned long *p;

	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
	BUG_ON(p == NULL);

	if (!install_p2mtop_page(pfn, p))
		free_page((unsigned long)p);
}

/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	topidx = p2m_top_index(pfn);
	if (p2m_top[topidx] == p2m_missing) {
		if (mfn == INVALID_P2M_ENTRY)
			return true;
		return false;
	}

	idx = p2m_index(pfn);
	p2m_top[topidx][idx] = mfn;

	return true;
}

void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return;
	}

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		alloc_p2m(pfn);

		if (!__set_phys_to_machine(pfn, mfn))
			BUG();
	}
}

unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);

	return PFN_DOWN(maddr.maddr);
}

xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte;
	unsigned offset;

	/*
	 * if the PFN is in the linear mapped vaddr range, we can just use
	 * the (quick) virt_to_machine() p2m lookup
	 */
	if (virt_addr_valid(vaddr))
		return virt_to_machine(vaddr);

	/* otherwise we have to do a (slower) full page-table walk */

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);
	offset = address & ~PAGE_MASK;
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
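
/*
 * Illustrative sketch, not part of the build: a caller converting a
 * kernel virtual address.  A lowmem address takes the quick p2m path
 * above; a vmalloc/ioremap address forces the lookup_address() walk.
 * The function name is made up for illustration.
 */
#if 0
static void arbitrary_virt_example(void)
{
	static char lowmem_buf[32];	/* static => direct-mapped lowmem */
	xmaddr_t ma = arbitrary_virt_to_machine(lowmem_buf + 5);

	/* ma.maddr is (mfn << PAGE_SHIFT) plus the same in-page offset
	   that lowmem_buf + 5 has within its virtual page. */
	(void)ma;
}
#endif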

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}


static bool xen_page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

static bool xen_iomap_pte(pte_t pte)
{
	return pte_flags(pte) & _PAGE_IOMAP;
}

static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;

	/* ptep might be kmapped when using 32-bit HIGHPTE */
	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
	u->val = pte_val_ma(pteval);

	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static void xen_extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL) {
		ADD_STATS(mmu_update_extended, 1);
		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);

		mcs.mc->args[1]++;

		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
		else
			ADD_STATS(mmu_update_histo[0], 1);
	} else {
		ADD_STATS(mmu_update, 1);
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
		ADD_STATS(mmu_update_histo[1], 1);
	}

	u = mcs.args;
	*u = *update;
}

void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}
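
/*
 * Illustrative sketch, not part of the build: batching several updates.
 * While the vcpu is in PARAVIRT_LAZY_MMU mode, xen_mc_issue() above only
 * marks the multicall for a later flush, and xen_extend_mmu_update()
 * keeps appending to the same pending MULTI_mmu_update, so consecutive
 * updates reach Xen as a single hypercall.  The function name and the
 * calling context are made up for illustration.
 */
#if 0
static void batched_pmd_update_example(pmd_t *a, pmd_t va, pmd_t *b, pmd_t vb)
{
	arch_enter_lazy_mmu_mode();	/* lazy mode: issues are deferred */

	xen_set_pmd_hyper(a, va);	/* queued, not yet issued */
	xen_set_pmd_hyper(b, vb);	/* appended to the same multicall */

	arch_leave_lazy_mmu_mode();	/* one hypercall flushes both */
}
#endif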

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	ADD_STATS(pmd_update, 1);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	ADD_STATS(pmd_update_pinned, 1);

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}

void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	if (xen_iomap_pte(pteval)) {
		xen_set_iomap_pte(ptep, pteval);
		goto out;
	}

	ADD_STATS(set_pte_at, 1);
//	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
	ADD_STATS(set_pte_at_current, mm == current->mm);
	ADD_STATS(set_pte_at_kernel, mm == &init_mm);

	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			ADD_STATS(set_pte_at_batched, 1);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			goto out;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				goto out;
	}
	xen_set_pte(ptep, pteval);

out:	return;
}

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	xen_mc_batch();

	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	ADD_STATS(prot_commit, 1);
	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
	}

	return val;
}
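
/*
 * Illustrative sketch, not part of the build: a worked example of the
 * split used by the two helpers above.  With PAGE_SHIFT == 12, the pte
 * value 0x12345067 splits into frame number 0x12345 and flags 0x067
 * (PRESENT|RW|USER|ACCESSED|DIRTY); only the frame number is remapped,
 * the flag bits pass through untouched.  The function name is made up.
 */
#if 0
static pteval_t pte_translate_example(void)
{
	pteval_t val = 0x12345067;
	unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;	/* 0x12345 */
	pteval_t flags = val & PTE_FLAGS_MASK;			/* 0x067 */

	return ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
}
#endif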

static pteval_t iomap_pte(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;

		/* We assume the pte frame number is a MFN, so
		   just use it as-is. */
		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

pteval_t xen_pte_val(pte_t pte)
{
	if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
		return pte.pte;

	return pte_mfn_to_pfn(pte.pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

pte_t xen_make_pte(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);

	/*
	 * Unprivileged domains are allowed to do IOMAPpings for
	 * PCI passthrough, but not map ISA space.  The ISA
	 * mappings are just dummy local mappings to keep other
	 * parts of the kernel happy.
	 */
	if (unlikely(pte & _PAGE_IOMAP) &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		pte = iomap_pte(pte);
	} else {
		pte &= ~_PAGE_IOMAP;
		pte = pte_pfn_to_mfn(pte);
	}

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pud(pud_t *ptr, pud_t val)
{
	ADD_STATS(pud_update, 1);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	ADD_STATS(pud_update_pinned, 1);

	xen_set_pud_hyper(ptr, val);
}

void xen_set_pte(pte_t *ptep, pte_t pte)
{
	if (xen_iomap_pte(pte)) {
		xen_set_iomap_pte(ptep, pte);
		return;
	}

	ADD_STATS(pte_update, 1);
//	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

#ifdef CONFIG_X86_PAE
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
#else
	*ptep = pte;
#endif
}

#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	if (xen_iomap_pte(pte)) {
		xen_set_iomap_pte(ptep, pte);
		return;
	}

	set_64bit((u64 *)ptep, native_pte_val(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}
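
/*
 * Illustrative sketch, not part of the build: on x86-64 a process pgd
 * page can carry a companion usermode pgd in page->private; slots below
 * USER_LIMIT exist in both tables, which is why xen_set_pgd() below
 * writes both copies.  The function name is made up for illustration.
 */
#if 0
static void user_pgd_example(struct mm_struct *mm, unsigned long uaddr)
{
	pgd_t *kernel_pgd = pgd_offset(mm, uaddr);
	pgd_t *user_pgd = xen_get_user_pgd(kernel_pgd);

	/* If non-NULL, user_pgd is the slot Xen consults while the
	   process runs in usermode. */
	(void)user_pgd;
}
#endif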

static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	xen_extend_mmu_update(&u);
}

/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there's a page structure.  This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3. It has no user pagetable attached to it
 */
void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	ADD_STATS(pgd_update, 1);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			*user_ptr = val;
		}
		return;
	}

	ADD_STATS(pgd_update_pinned, 1);
	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	/* If it's pinned, then we can at least batch the kernel and
	   user updates together. */
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);
	if (user_ptr)
		__xen_set_pgd_hyper(user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif	/* PAGETABLE_LEVELS == 4 */

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * will end up making a zero-sized hole and so is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTLOCKS
	ptl = __pte_lockptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it.  If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits).  The solution is to mark RO and pin each PTE
		 * page while holding the lock.  This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	vm_unmap_aliases();

	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
	xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
	__xen_pgd_pin(mm, mm->pgd);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

static void __init xen_mark_init_mm_pinned(void)
{
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * Do the converse to pin_page.  If we're using split
		 * pte locks, we must be holding the lock while the
		 * pte page is unpinned but still RO to prevent
		 * concurrent updates from seeing it in this
		 * partially-pinned state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		if (user_pgd) {
			xen_do_pin(MMUEXT_UNPIN_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		       PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
	__xen_pgd_unpin(mm, mm->pgd);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

	active_mm = percpu_read(cpu_tlbstate.active_mm);

	if (active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
		load_cr3(swapper_pg_dir);
}

Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001226static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001227{
Mike Travise4d98202008-12-16 17:34:05 -08001228 cpumask_var_t mask;
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001229 unsigned cpu;
1230
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001231 if (current->active_mm == mm) {
1232 if (current->mm == mm)
1233 load_cr3(swapper_pg_dir);
1234 else
1235 leave_mm(smp_processor_id());
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001236 }
1237
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001238 /* Get the "official" set of cpus referring to our pagetable. */
Mike Travise4d98202008-12-16 17:34:05 -08001239 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1240 for_each_online_cpu(cpu) {
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001241 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
Mike Travise4d98202008-12-16 17:34:05 -08001242 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1243 continue;
1244 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1245 }
1246 return;
1247 }
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001248 cpumask_copy(mask, mm_cpumask(mm));
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001249
1250 /* It's possible that a vcpu may have a stale reference to our
1251	   cr3, because it's in lazy mode and hasn't yet flushed
1252	   its set of pending hypercalls. In this case, we can
1253 look at its actual current cr3 value, and force it to flush
1254 if needed. */
1255 for_each_online_cpu(cpu) {
1256 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
Mike Travise4d98202008-12-16 17:34:05 -08001257 cpumask_set_cpu(cpu, mask);
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001258 }
1259
Mike Travise4d98202008-12-16 17:34:05 -08001260 if (!cpumask_empty(mask))
1261 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1262 free_cpumask_var(mask);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001263}
1264#else
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001265static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001266{
1267 if (current->active_mm == mm)
1268 load_cr3(swapper_pg_dir);
1269}
1270#endif
1271
1272/*
1273 * While a process runs, Xen pins its pagetables, which means that the
1274 * hypervisor forces them to be read-only and controls all updates
1275 * to them. This means that all pagetable updates have to go via the
1276 * hypervisor, which is moderately expensive.
1277 *
1278 * Since we're pulling the pagetable down, we switch over to init_mm,
1279 * unpin the old process's pagetable and mark it all read-write, which
1280 * allows further operations on it to be simple memory accesses.
1281 *
1282 * The only subtle point is that another CPU may still be using the
1283 * pagetable because of lazy tlb flushing. This means we need to
1284 * switch all CPUs off this pagetable before we can unpin it.
1285 */
1286void xen_exit_mmap(struct mm_struct *mm)
1287{
1288	get_cpu();		/* make sure we don't get migrated */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001289 xen_drop_mm_ref(mm);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001290 put_cpu();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001291
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001292 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingedf912ea2007-09-25 11:50:00 -07001293
1294 /* pgd may not be pinned in the error exit path of execve */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001295 if (xen_page_pinned(mm->pgd))
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001296 xen_pgd_unpin(mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001297
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001298 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001299}
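
/*
 * For orientation: xen_activate_mm(), xen_dup_mmap() and
 * xen_exit_mmap() above are wired into pv_mmu_ops below, and are
 * invoked from the core mm on exec, fork and process teardown
 * respectively, so a pagetable stays pinned for as long as some task
 * may be running on it.
 */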
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001300
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001301static __init void xen_pagetable_setup_start(pgd_t *base)
1302{
1303}
1304
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001305static void xen_post_allocator_init(void);
1306
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001307static __init void xen_pagetable_setup_done(pgd_t *base)
1308{
1309 xen_setup_shared_info();
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001310 xen_post_allocator_init();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001311}
1312
1313static void xen_write_cr2(unsigned long cr2)
1314{
1315 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1316}
1317
1318static unsigned long xen_read_cr2(void)
1319{
1320 return percpu_read(xen_vcpu)->arch.cr2;
1321}
1322
1323unsigned long xen_read_cr2_direct(void)
1324{
1325 return percpu_read(xen_vcpu_info.arch.cr2);
1326}
1327
1328static void xen_flush_tlb(void)
1329{
1330 struct mmuext_op *op;
1331 struct multicall_space mcs;
1332
1333 preempt_disable();
1334
1335 mcs = xen_mc_entry(sizeof(*op));
1336
1337 op = mcs.args;
1338 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1339 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1340
1341 xen_mc_issue(PARAVIRT_LAZY_MMU);
1342
1343 preempt_enable();
1344}
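
/*
 * The shape of xen_flush_tlb() above recurs throughout this file:
 * reserve space in the per-cpu multicall buffer, fill in a single
 * mmuext op, and let xen_mc_issue() either queue it (when in lazy MMU
 * mode) or flush it to the hypervisor immediately. A minimal sketch
 * of that pattern for an arbitrary argumentless command;
 * xen_mmuext_simple() is a hypothetical helper, not a function in
 * this file:
 */
static inline void xen_mmuext_simple(unsigned cmd)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();		/* the multicall buffer is per-cpu */

	mcs = xen_mc_entry(sizeof(*op));	/* reserve argument space */
	op = mcs.args;
	op->cmd = cmd;			/* e.g. MMUEXT_TLB_FLUSH_LOCAL */
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);	/* batch or flush now */

	preempt_enable();
}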
1345
1346static void xen_flush_tlb_single(unsigned long addr)
1347{
1348 struct mmuext_op *op;
1349 struct multicall_space mcs;
1350
1351 preempt_disable();
1352
1353 mcs = xen_mc_entry(sizeof(*op));
1354 op = mcs.args;
1355 op->cmd = MMUEXT_INVLPG_LOCAL;
1356 op->arg1.linear_addr = addr & PAGE_MASK;
1357 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1358
1359 xen_mc_issue(PARAVIRT_LAZY_MMU);
1360
1361 preempt_enable();
1362}
1363
1364static void xen_flush_tlb_others(const struct cpumask *cpus,
1365 struct mm_struct *mm, unsigned long va)
1366{
1367 struct {
1368 struct mmuext_op op;
1369 DECLARE_BITMAP(mask, NR_CPUS);
1370 } *args;
1371 struct multicall_space mcs;
1372
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001373 if (cpumask_empty(cpus))
1374 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001375
1376 mcs = xen_mc_entry(sizeof(*args));
1377 args = mcs.args;
1378 args->op.arg2.vcpumask = to_cpumask(args->mask);
1379
1380	/* Remove us, and any offline CPUs. */
1381 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1382 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001383
1384 if (va == TLB_FLUSH_ALL) {
1385 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1386 } else {
1387 args->op.cmd = MMUEXT_INVLPG_MULTI;
1388 args->op.arg1.linear_addr = va;
1389 }
1390
1391 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1392
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001393 xen_mc_issue(PARAVIRT_LAZY_MMU);
1394}
1395
1396static unsigned long xen_read_cr3(void)
1397{
1398 return percpu_read(xen_cr3);
1399}
1400
1401static void set_current_cr3(void *v)
1402{
1403 percpu_write(xen_current_cr3, (unsigned long)v);
1404}
1405
1406static void __xen_write_cr3(bool kernel, unsigned long cr3)
1407{
1408 struct mmuext_op *op;
1409 struct multicall_space mcs;
1410 unsigned long mfn;
1411
1412 if (cr3)
1413 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1414 else
1415 mfn = 0;
1416
1417 WARN_ON(mfn == 0 && kernel);
1418
1419 mcs = __xen_mc_entry(sizeof(*op));
1420
1421 op = mcs.args;
1422 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1423 op->arg1.mfn = mfn;
1424
1425 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1426
1427 if (kernel) {
1428 percpu_write(xen_cr3, cr3);
1429
1430 /* Update xen_current_cr3 once the batch has actually
1431 been submitted. */
1432 xen_mc_callback(set_current_cr3, (void *)cr3);
1433 }
1434}
1435
1436static void xen_write_cr3(unsigned long cr3)
1437{
1438 BUG_ON(preemptible());
1439
1440 xen_mc_batch(); /* disables interrupts */
1441
1442	/* Update while interrupts are disabled, so it's atomic with
1443	   respect to IPIs */
1444 percpu_write(xen_cr3, cr3);
1445
1446 __xen_write_cr3(true, cr3);
1447
1448#ifdef CONFIG_X86_64
1449 {
1450 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1451 if (user_pgd)
1452 __xen_write_cr3(false, __pa(user_pgd));
1453 else
1454 __xen_write_cr3(false, 0);
1455 }
1456#endif
1457
1458 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1459}
1460
1461static int xen_pgd_alloc(struct mm_struct *mm)
1462{
1463 pgd_t *pgd = mm->pgd;
1464 int ret = 0;
1465
1466 BUG_ON(PagePinned(virt_to_page(pgd)));
1467
1468#ifdef CONFIG_X86_64
1469 {
1470 struct page *page = virt_to_page(pgd);
1471 pgd_t *user_pgd;
1472
1473 BUG_ON(page->private != 0);
1474
1475 ret = -ENOMEM;
1476
1477 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1478 page->private = (unsigned long)user_pgd;
1479
1480 if (user_pgd != NULL) {
1481 user_pgd[pgd_index(VSYSCALL_START)] =
1482 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1483 ret = 0;
1484 }
1485
1486 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1487 }
1488#endif
1489
1490 return ret;
1491}
1492
1493static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1494{
1495#ifdef CONFIG_X86_64
1496 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1497
1498 if (user_pgd)
1499 free_page((unsigned long)user_pgd);
1500#endif
1501}
1502
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001503#ifdef CONFIG_X86_32
1504static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1505{
1506 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
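	/*
	 * The mask below evaluates to all-ones if the existing pte had
	 * _PAGE_RW, and to ~_PAGE_RW if it didn't; so the new pte may
	 * only keep _PAGE_RW if the existing mapping was already RW.
	 */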
1507 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1508 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1509 pte_val_ma(pte));
1510
1511 return pte;
1512}
1513
1514/* Init-time set_pte while constructing initial pagetables, which
1515 doesn't allow RO pagetable pages to be remapped RW */
1516static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1517{
1518 pte = mask_rw_pte(ptep, pte);
1519
1520 xen_set_pte(ptep, pte);
1521}
1522#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001523
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001524static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1525{
1526 struct mmuext_op op;
1527 op.cmd = cmd;
1528 op.arg1.mfn = pfn_to_mfn(pfn);
1529 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1530 BUG();
1531}
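
/*
 * Unlike the multicall users above, pin_pagetable_pfn() issues its
 * hypercall synchronously. Example use from later in this file (the
 * boot-time pin of the new 64-bit toplevel), quoted for illustration:
 *
 *	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
 *			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
 */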
1532
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001533/* Early in boot, while setting up the initial pagetable, assume
1534 everything is pinned. */
1535static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1536{
1537#ifdef CONFIG_FLATMEM
1538 BUG_ON(mem_map); /* should only be used early */
1539#endif
1540 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001541 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1542}
1543
1544/* Used for pmd and pud */
1545static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1546{
1547#ifdef CONFIG_FLATMEM
1548 BUG_ON(mem_map); /* should only be used early */
1549#endif
1550 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001551}
1552
1553/* Early release_pte assumes that all pte pages are pinned, since there's
1554 only init_mm and anything attached to that is pinned. */
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001555static __init void xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001556{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001557 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001558 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1559}
1560
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001561static __init void xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001562{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001563 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001564}
1565
1566/* This needs to make sure the new pte page is pinned iff it's being
1567 attached to a pinned pagetable. */
1568static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1569{
1570 struct page *page = pfn_to_page(pfn);
1571
1572 if (PagePinned(virt_to_page(mm->pgd))) {
1573 SetPagePinned(page);
1574
1575 vm_unmap_aliases();
1576 if (!PageHighMem(page)) {
1577 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1578 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1579 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1580 } else {
1581 /* make sure there are no stray mappings of
1582 this page */
1583 kmap_flush_unused();
1584 }
1585 }
1586}
1587
1588static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1589{
1590 xen_alloc_ptpage(mm, pfn, PT_PTE);
1591}
1592
1593static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1594{
1595 xen_alloc_ptpage(mm, pfn, PT_PMD);
1596}
1597
1598/* This should never be called until we're OK to use struct page */
1599static void xen_release_ptpage(unsigned long pfn, unsigned level)
1600{
1601 struct page *page = pfn_to_page(pfn);
1602
1603 if (PagePinned(page)) {
1604 if (!PageHighMem(page)) {
1605 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1606 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1607 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1608 }
1609 ClearPagePinned(page);
1610 }
1611}
1612
1613static void xen_release_pte(unsigned long pfn)
1614{
1615 xen_release_ptpage(pfn, PT_PTE);
1616}
1617
1618static void xen_release_pmd(unsigned long pfn)
1619{
1620 xen_release_ptpage(pfn, PT_PMD);
1621}
1622
1623#if PAGETABLE_LEVELS == 4
1624static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1625{
1626 xen_alloc_ptpage(mm, pfn, PT_PUD);
1627}
1628
1629static void xen_release_pud(unsigned long pfn)
1630{
1631 xen_release_ptpage(pfn, PT_PUD);
1632}
1633#endif
1634
1635void __init xen_reserve_top(void)
1636{
1637#ifdef CONFIG_X86_32
1638 unsigned long top = HYPERVISOR_VIRT_START;
1639 struct xen_platform_parameters pp;
1640
1641 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1642 top = pp.virt_start;
1643
1644 reserve_top_address(-top);
1645#endif /* CONFIG_X86_32 */
1646}
1647
1648/*
1649 * Like __va(), but returns the address in the kernel mapping (which is
1650 * all we have until the physical memory mapping has been set up).
1651 */
1652static void *__ka(phys_addr_t paddr)
1653{
1654#ifdef CONFIG_X86_64
1655 return (void *)(paddr + __START_KERNEL_map);
1656#else
1657 return __va(paddr);
1658#endif
1659}
1660
1661/* Convert a machine address to physical address */
1662static unsigned long m2p(phys_addr_t maddr)
1663{
1664 phys_addr_t paddr;
1665
1666 maddr &= PTE_PFN_MASK;
1667 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1668
1669 return paddr;
1670}
1671
1672/* Convert a machine address to kernel virtual */
1673static void *m2v(phys_addr_t maddr)
1674{
1675 return __ka(m2p(maddr));
1676}
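
/*
 * Example (taken from xen_setup_kernel_pagetable() below, 64-bit
 * case): walking from a pgd entry in the Xen-provided pagetable,
 * whose entries hold mfns, back to a kernel virtual pointer for the
 * next level:
 *
 *	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
 */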
1677
1678static void set_page_prot(void *addr, pgprot_t prot)
1679{
1680 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1681 pte_t pte = pfn_pte(pfn, prot);
1682
1683 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1684 BUG();
1685}
1686
1687static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1688{
1689 unsigned pmdidx, pteidx;
1690 unsigned ident_pte;
1691 unsigned long pfn;
1692
1693 ident_pte = 0;
1694 pfn = 0;
1695 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1696 pte_t *pte_page;
1697
1698 /* Reuse or allocate a page of ptes */
1699 if (pmd_present(pmd[pmdidx]))
1700 pte_page = m2v(pmd[pmdidx].pmd);
1701 else {
1702 /* Check for free pte pages */
1703 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1704 break;
1705
1706 pte_page = &level1_ident_pgt[ident_pte];
1707 ident_pte += PTRS_PER_PTE;
1708
1709 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1710 }
1711
1712 /* Install mappings */
1713 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1714 pte_t pte;
1715
1716 if (pfn > max_pfn_mapped)
1717 max_pfn_mapped = pfn;
1718
1719 if (!pte_none(pte_page[pteidx]))
1720 continue;
1721
1722 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1723 pte_page[pteidx] = pte;
1724 }
1725 }
1726
1727 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1728 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1729
1730 set_page_prot(pmd, PAGE_KERNEL_RO);
1731}
1732
1733#ifdef CONFIG_X86_64
1734static void convert_pfn_mfn(void *v)
1735{
1736 pte_t *pte = v;
1737 int i;
1738
1739 /* All levels are converted the same way, so just treat them
1740 as ptes. */
1741 for (i = 0; i < PTRS_PER_PTE; i++)
1742 pte[i] = xen_make_pte(pte[i].pte);
1743}
1744
1745/*
1746 * Set up the initial kernel pagetable.
1747 *
1748 * We can construct this by grafting the Xen provided pagetable into
1749 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1750 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1751 * means that only the kernel has a physical mapping to start with -
1752 * but that's enough to get __va working. We need to fill in the rest
1753 * of the physical mapping once some sort of allocator has been set
1754 * up.
1755 */
1756__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1757 unsigned long max_pfn)
1758{
1759 pud_t *l3;
1760 pmd_t *l2;
1761
1762 /* Zap identity mapping */
1763 init_level4_pgt[0] = __pgd(0);
1764
1765 /* Pre-constructed entries are in pfn, so convert to mfn */
1766 convert_pfn_mfn(init_level4_pgt);
1767 convert_pfn_mfn(level3_ident_pgt);
1768 convert_pfn_mfn(level3_kernel_pgt);
1769
1770 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1771 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1772
1773 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1774 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1775
1776 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1777 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1778 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1779
1780 /* Set up identity map */
1781 xen_map_identity_early(level2_ident_pgt, max_pfn);
1782
1783 /* Make pagetable pieces RO */
1784 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1785 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1786 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1787 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1788 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1789 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1790
1791 /* Pin down new L4 */
1792 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1793 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1794
1795 /* Unpin Xen-provided one */
1796 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1797
1798 /* Switch over */
1799 pgd = init_level4_pgt;
1800
1801 /*
1802 * At this stage there can be no user pgd, and no page
1803 * structure to attach it to, so make sure we just set the kernel
1804 * pgd.
1805 */
1806 xen_mc_batch();
1807 __xen_write_cr3(true, __pa(pgd));
1808 xen_mc_issue(PARAVIRT_LAZY_CPU);
1809
1810 reserve_early(__pa(xen_start_info->pt_base),
1811 __pa(xen_start_info->pt_base +
1812 xen_start_info->nr_pt_frames * PAGE_SIZE),
1813 "XEN PAGETABLES");
1814
1815 return pgd;
1816}
1817#else /* !CONFIG_X86_64 */
1818static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1819
1820__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1821 unsigned long max_pfn)
1822{
1823 pmd_t *kernel_pmd;
1824
Jeremy Fitzhardinge93dbda72009-02-26 17:35:44 -08001825 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1826 xen_start_info->nr_pt_frames * PAGE_SIZE +
1827 512*1024);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001828
1829 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1830 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1831
1832 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1833
1834 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1835 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1836 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1837
1838 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1839 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1840 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1841
1842 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1843
1844 xen_write_cr3(__pa(swapper_pg_dir));
1845
1846 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1847
Jeremy Fitzhardinge33df4db2009-05-07 11:56:44 -07001848 reserve_early(__pa(xen_start_info->pt_base),
1849 __pa(xen_start_info->pt_base +
1850 xen_start_info->nr_pt_frames * PAGE_SIZE),
1851 "XEN PAGETABLES");
1852
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001853 return swapper_pg_dir;
1854}
1855#endif /* CONFIG_X86_64 */
1856
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07001857static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001858{
1859 pte_t pte;
1860
1861 phys >>= PAGE_SHIFT;
1862
1863 switch (idx) {
1864 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1865#ifdef CONFIG_X86_F00F_BUG
1866 case FIX_F00F_IDT:
1867#endif
1868#ifdef CONFIG_X86_32
1869 case FIX_WP_TEST:
1870 case FIX_VDSO:
1871# ifdef CONFIG_HIGHMEM
1872 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1873# endif
1874#else
1875 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1876#endif
1877#ifdef CONFIG_X86_LOCAL_APIC
1878 case FIX_APIC_BASE: /* maps dummy local APIC */
1879#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08001880 case FIX_TEXT_POKE0:
1881 case FIX_TEXT_POKE1:
1882 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001883 pte = pfn_pte(phys, prot);
1884 break;
1885
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001886 case FIX_PARAVIRT_BOOTMAP:
1887 /* This is an MFN, but it isn't an IO mapping from the
1888 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001889 pte = mfn_pte(phys, prot);
1890 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08001891
1892 default:
1893 /* By default, set_fixmap is used for hardware mappings */
1894 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1895 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001896 }
1897
1898 __native_set_fixmap(idx, pte);
1899
1900#ifdef CONFIG_X86_64
1901 /* Replicate changes to map the vsyscall page into the user
1902 pagetable vsyscall mapping. */
1903 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1904 unsigned long vaddr = __fix_to_virt(idx);
1905 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1906 }
1907#endif
1908}
1909
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001910static __init void xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001911{
1912 pv_mmu_ops.set_pte = xen_set_pte;
1913 pv_mmu_ops.set_pmd = xen_set_pmd;
1914 pv_mmu_ops.set_pud = xen_set_pud;
1915#if PAGETABLE_LEVELS == 4
1916 pv_mmu_ops.set_pgd = xen_set_pgd;
1917#endif
1918
1919 /* This will work as long as patching hasn't happened yet
1920 (which it hasn't) */
1921 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1922 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1923 pv_mmu_ops.release_pte = xen_release_pte;
1924 pv_mmu_ops.release_pmd = xen_release_pmd;
1925#if PAGETABLE_LEVELS == 4
1926 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1927 pv_mmu_ops.release_pud = xen_release_pud;
1928#endif
1929
1930#ifdef CONFIG_X86_64
1931 SetPagePinned(virt_to_page(level3_user_vsyscall));
1932#endif
1933 xen_mark_init_mm_pinned();
1934}
1935
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001936static void xen_leave_lazy_mmu(void)
1937{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08001938 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001939 xen_mc_flush();
1940 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08001941 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08001942}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001943
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02001944static const struct pv_mmu_ops xen_mmu_ops __initdata = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001945 .read_cr2 = xen_read_cr2,
1946 .write_cr2 = xen_write_cr2,
1947
1948 .read_cr3 = xen_read_cr3,
1949 .write_cr3 = xen_write_cr3,
1950
1951 .flush_tlb_user = xen_flush_tlb,
1952 .flush_tlb_kernel = xen_flush_tlb,
1953 .flush_tlb_single = xen_flush_tlb_single,
1954 .flush_tlb_others = xen_flush_tlb_others,
1955
1956 .pte_update = paravirt_nop,
1957 .pte_update_defer = paravirt_nop,
1958
1959 .pgd_alloc = xen_pgd_alloc,
1960 .pgd_free = xen_pgd_free,
1961
1962 .alloc_pte = xen_alloc_pte_init,
1963 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001964 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001965 .alloc_pmd_clone = paravirt_nop,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001966 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001967
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001968#ifdef CONFIG_X86_64
1969 .set_pte = xen_set_pte,
1970#else
1971 .set_pte = xen_set_pte_init,
1972#endif
1973 .set_pte_at = xen_set_pte_at,
1974 .set_pmd = xen_set_pmd_hyper,
1975
1976 .ptep_modify_prot_start = __ptep_modify_prot_start,
1977 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1978
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001979 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1980 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001981
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001982 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1983 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001984
1985#ifdef CONFIG_X86_PAE
1986 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001987 .pte_clear = xen_pte_clear,
1988 .pmd_clear = xen_pmd_clear,
1989#endif /* CONFIG_X86_PAE */
1990 .set_pud = xen_set_pud_hyper,
1991
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001992 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1993 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001994
1995#if PAGETABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08001996 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1997 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001998 .set_pgd = xen_set_pgd_hyper,
1999
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002000 .alloc_pud = xen_alloc_pmd_init,
2001 .release_pud = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002002#endif /* PAGETABLE_LEVELS == 4 */
2003
2004 .activate_mm = xen_activate_mm,
2005 .dup_mmap = xen_dup_mmap,
2006 .exit_mmap = xen_exit_mmap,
2007
2008 .lazy_mode = {
2009 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002010 .leave = xen_leave_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002011 },
2012
2013 .set_fixmap = xen_set_fixmap,
2014};
2015
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002016void __init xen_init_mmu_ops(void)
2017{
2018 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2019 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2020 pv_mmu_ops = xen_mmu_ops;
2021}
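
/*
 * After xen_init_mmu_ops() has replaced pv_mmu_ops, the generic
 * pagetable accessors route here; e.g. a plain set_pte(ptep, pte) in
 * core code ends up (via the paravirt dispatch) in xen_set_pte(), or
 * in xen_set_pte_init() during early 32-bit boot.
 */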
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002022
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002023#ifdef CONFIG_XEN_DEBUG_FS
2024
2025static struct dentry *d_mmu_debug;
2026
2027static int __init xen_mmu_debugfs(void)
2028{
2029 struct dentry *d_xen = xen_init_debugfs();
2030
2031 if (d_xen == NULL)
2032 return -ENOMEM;
2033
2034 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2035
2036 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2037
2038 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2039 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2040 &mmu_stats.pgd_update_pinned);
2041 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2042			   &mmu_stats.pgd_update_batched);
2043
2044 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2045 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2046 &mmu_stats.pud_update_pinned);
2047 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2048			   &mmu_stats.pud_update_batched);
2049
2050 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2051 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2052 &mmu_stats.pmd_update_pinned);
2053 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2054			   &mmu_stats.pmd_update_batched);
2055
2056 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2057// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2058// &mmu_stats.pte_update_pinned);
2059 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2060			   &mmu_stats.pte_update_batched);
2061
2062 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2063 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2064 &mmu_stats.mmu_update_extended);
2065 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2066 mmu_stats.mmu_update_histo, 20);
2067
2068 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2069 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2070 &mmu_stats.set_pte_at_batched);
2071 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2072 &mmu_stats.set_pte_at_current);
2073 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2074 &mmu_stats.set_pte_at_kernel);
2075
2076 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2077 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2078 &mmu_stats.prot_commit_batched);
2079
2080 return 0;
2081}
2082fs_initcall(xen_mmu_debugfs);
2083
2084#endif /* CONFIG_XEN_DEBUG_FS */