/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion. In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable. When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest. This prevents uncontrolled
 * guest updates to the pagetable. Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow. The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use. This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
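
/*
 * As a rough illustration of that conversion (the real helpers are
 * mfn_pte()/pte_mfn() and the xen_*_val()/xen_make_*() routines later
 * in this file), a present pte is built and read back roughly as:
 *
 *        pte = mfn_pte(pfn_to_mfn(pfn), flags);        (pfn -> mfn on write)
 *        pfn = mfn_to_pfn(pte_mfn(pte));               (mfn -> pfn on read)
 */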
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

#define P2M_ENTRIES_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES                 (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
        __attribute__((section(".data.page_aligned"))) =
                { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES]
        __attribute__((section(".data.page_aligned"))) =
                { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES]
        __attribute__((section(".bss.page_aligned")));

static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
        __attribute__((section(".bss.page_aligned")));

static inline unsigned p2m_top_index(unsigned long pfn)
{
        BUG_ON(pfn >= MAX_DOMAIN_PAGES);
        return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
        return pfn % P2M_ENTRIES_PER_PAGE;
}
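
/*
 * For example (assuming 4K pages and 4-byte entries, so
 * P2M_ENTRIES_PER_PAGE == 1024), looking up pfn 0x12345 goes through
 * two levels:
 *
 *        topidx = 0x12345 / 1024;              which p2m page
 *        idx    = 0x12345 % 1024;              slot within that page
 *        mfn    = p2m_top[topidx][idx];
 *
 * Holes point at the shared p2m_missing page, so they read back as ~0UL.
 */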

/* Build the parallel p2m_top_mfn structures */
void xen_setup_mfn_list_list(void)
{
        unsigned pfn, idx;

        for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
                unsigned topidx = p2m_top_index(pfn);

                p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
        }

        for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
                unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
                p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
        }

        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
                virt_to_mfn(p2m_top_mfn_list);
        HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
        unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
        unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
        unsigned pfn;

        for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
                unsigned topidx = p2m_top_index(pfn);

                p2m_top[topidx] = &mfn_list[pfn];
        }
}

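/*
 * Look up the machine frame for a pfn. Out-of-range pfns and holes in
 * the p2m map read back as INVALID_P2M_ENTRY.
 */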
unsigned long get_phys_to_machine(unsigned long pfn)
{
        unsigned topidx, idx;

        if (unlikely(pfn >= MAX_DOMAIN_PAGES))
                return INVALID_P2M_ENTRY;

        topidx = p2m_top_index(pfn);
        idx = p2m_index(pfn);
        return p2m_top[topidx][idx];
}

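/*
 * Replace the p2m_missing placeholder at *pp with a freshly allocated
 * p2m page (all entries invalid), publishing its mfn through *mfnp for
 * the save/restore lists. Racing allocators are resolved with cmpxchg;
 * the loser frees its page again.
 */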
static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
{
        unsigned long *p;
        unsigned i;

        p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
        BUG_ON(p == NULL);

        for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
                p[i] = INVALID_P2M_ENTRY;

        if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
                free_page((unsigned long)p);
        else
                *mfnp = virt_to_mfn(p);
}

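/*
 * Record a pfn -> mfn mapping, allocating a backing p2m page the first
 * time a region that still points at p2m_missing gets a real entry.
 */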
void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
        unsigned topidx, idx;

        if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
                BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
                return;
        }

        if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
                BUG_ON(mfn != INVALID_P2M_ENTRY);
                return;
        }

        topidx = p2m_top_index(pfn);
        if (p2m_top[topidx] == p2m_missing) {
                /* no need to allocate a page to store an invalid entry */
                if (mfn == INVALID_P2M_ENTRY)
                        return;
                alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
        }

        idx = p2m_index(pfn);
        p2m_top[topidx][idx] = mfn;
}

xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
        unsigned int level;
        pte_t *pte = lookup_address(address, &level);
        unsigned offset = address & ~PAGE_MASK;        /* offset within the page */

        BUG_ON(pte == NULL);

        return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

void make_lowmem_page_readonly(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;
        unsigned int level;

        pte = lookup_address(address, &level);
        BUG_ON(pte == NULL);

        ptev = pte_wrprotect(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;
        unsigned int level;

        pte = lookup_address(address, &level);
        BUG_ON(pte == NULL);

        ptev = pte_mkwrite(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}


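/*
 * Set a pmd entry. Pinned pagetables are read-only to the guest, so the
 * update is performed via an mmu_update hypercall, queued as a multicall
 * so it can be batched further under lazy MMU mode.
 */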
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
        struct multicall_space mcs;
        struct mmu_update *u;

        preempt_disable();

        mcs = xen_mc_entry(sizeof(*u));
        u = mcs.args;
        u->ptr = virt_to_machine(ptr).maddr;
        u->val = pmd_val_ma(val);
        MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);

        preempt_enable();
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <mfn,flags> stored as-is, to permit clearing entries */
        xen_set_pte(pte, mfn_pte(mfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

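/*
 * Set a pte in a process address space. For the current mm (or init_mm)
 * use update_va_mapping, which can be batched under lazy MMU mode; for
 * other mms, or if the hypercall fails, fall back to a direct
 * xen_set_pte() write.
 */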
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
{
        /* updates to init_mm may be done without lock */
        if (mm == &init_mm)
                preempt_disable();

        if (mm == current->mm || mm == &init_mm) {
                if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
                        struct multicall_space mcs;
                        mcs = xen_mc_entry(0);

                        MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
                        xen_mc_issue(PARAVIRT_LAZY_MMU);
                        goto out;
                } else
                        if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
                                goto out;
        }
        xen_set_pte(ptep, pteval);

out:
        if (mm == &init_mm)
                preempt_enable();
}

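/*
 * pte/pmd/pgd value conversions. Reading an entry translates the mfn it
 * holds back to a pfn (machine to pseudo-physical); constructing an
 * entry translates the pfn to an mfn, as described at the top of this
 * file.
 */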
pteval_t xen_pte_val(pte_t pte)
{
        pteval_t ret = pte.pte;

        if (ret & _PAGE_PRESENT)
                ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;

        return ret;
}

pgdval_t xen_pgd_val(pgd_t pgd)
{
        pgdval_t ret = pgd.pgd;
        if (ret & _PAGE_PRESENT)
                ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
        return ret;
}

pte_t xen_make_pte(pteval_t pte)
{
        if (pte & _PAGE_PRESENT) {
                pte = phys_to_machine(XPADDR(pte)).maddr;
                pte &= ~(_PAGE_PCD | _PAGE_PWT);
        }

        return (pte_t){ .pte = pte };
}

pgd_t xen_make_pgd(pgdval_t pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}

pmdval_t xen_pmd_val(pmd_t pmd)
{
        pmdval_t ret = native_pmd_val(pmd);
        if (ret & _PAGE_PRESENT)
                ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
        return ret;
}

void xen_set_pud(pud_t *ptr, pud_t val)
{
        struct multicall_space mcs;
        struct mmu_update *u;

        preempt_disable();

        mcs = xen_mc_entry(sizeof(*u));
        u = mcs.args;
        u->ptr = virt_to_machine(ptr).maddr;
        u->val = pud_val_ma(val);
        MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);

        preempt_enable();
}

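/*
 * PAE ptes can't be written in one go here, so ordering matters: on set,
 * write the high word first and then the low word (which holds the
 * present bit); on clear, do the opposite. That way a half-written
 * entry is never seen as present with a stale frame.
 */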
void xen_set_pte(pte_t *ptep, pte_t pte)
{
        ptep->pte_high = pte.pte_high;
        smp_wmb();
        ptep->pte_low = pte.pte_low;
}

void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_64bit((u64 *)ptep, pte_val_ma(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();                /* make sure low gets written first */
        ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
        xen_set_pmd(pmdp, __pmd(0));
}

pmd_t xen_make_pmd(pmdval_t pmd)
{
        if (pmd & _PAGE_PRESENT)
                pmd = phys_to_machine(XPADDR(pmd)).maddr;

        return native_make_pmd(pmd);
}

/*
 * (Yet another) pagetable walker. This one is intended for pinning a
 * pagetable. This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level. It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit. In the normal case this
 * will be TASK_SIZE, but at boot we need to pin up to FIXADDR_TOP.
 * The important bit is that we don't pin beyond there, because then
 * we start getting into Xen's ptes.
 */
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
                    unsigned long limit)
{
        pgd_t *pgd = pgd_base;
        int flush = 0;
        unsigned long addr = 0;
        unsigned long pgd_next;

        BUG_ON(limit > FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
                pud_t *pud;
                unsigned long pud_limit, pud_next;

                pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

                if (!pgd_val(*pgd))
                        continue;

                pud = pud_offset(pgd, 0);

                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(virt_to_page(pud), PT_PUD);

                for (; addr != pud_limit; pud++, addr = pud_next) {
                        pmd_t *pmd;
                        unsigned long pmd_limit;

                        pud_next = pud_addr_end(addr, pud_limit);

                        if (pud_next < limit)
                                pmd_limit = pud_next;
                        else
                                pmd_limit = limit;

                        if (pud_none(*pud))
                                continue;

                        pmd = pmd_offset(pud, 0);

                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(virt_to_page(pmd), PT_PMD);

                        for (; addr != pmd_limit; pmd++) {
                                addr += (PAGE_SIZE * PTRS_PER_PTE);
                                if ((pmd_limit-1) < (addr-1)) {
                                        addr = pmd_limit;
                                        break;
                                }

                                if (pmd_none(*pmd))
                                        continue;

                                flush |= (*func)(pmd_page(*pmd), PT_PTE);
                        }
                }
        }

        flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

        return flush;
}

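/*
 * Helpers for pinning: take the split pte lock for a pte page (when
 * split pte locks are configured), release it later as a multicall
 * completion callback, and queue the actual MMUEXT pin/unpin operation.
 */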
static spinlock_t *lock_pte(struct page *page)
{
        spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
        ptl = __pte_lockptr(page);
        spin_lock(ptl);
#endif

        return ptl;
}

static void do_unlock(void *v)
{
        spinlock_t *ptl = v;
        spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
        struct mmuext_op *op;
        struct multicall_space mcs;

        mcs = __xen_mc_entry(sizeof(*op));
        op = mcs.args;
        op->cmd = level;
        op->arg1.mfn = pfn_to_mfn(pfn);
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

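/*
 * pgd_walk() callback for pinning: mark the page pinned and remap it
 * read-only via a batched update_va_mapping; L1 (pte) pages are also
 * explicitly pinned. Unpinned highmem pages can't be remapped here, so
 * they just ask the caller to flush kmaps.
 */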
static int pin_page(struct page *page, enum pt_level level)
{
        unsigned pgfl = TestSetPagePinned(page);
        int flush;

        if (pgfl)
                flush = 0;                /* already pinned */
        else if (PageHighMem(page))
                /* kmaps need flushing if we found an unpinned
                   highpage */
                flush = 1;
        else {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                struct multicall_space mcs = __xen_mc_entry(0);
                spinlock_t *ptl;

                flush = 0;

                ptl = NULL;
                if (level == PT_PTE)
                        ptl = lock_pte(page);

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL_RO),
                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);

                if (level == PT_PTE)
                        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

                if (ptl) {
                        /* Queue a deferred unlock for when this batch
                           is completed. */
                        xen_mc_callback(do_unlock, ptl);
                }
        }

        return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet. We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
        xen_mc_batch();

        if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
                /* re-enable interrupts for kmap_flush_unused */
                xen_mc_issue(0);
                kmap_flush_unused();
                xen_mc_batch();
        }

        xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
        xen_mc_issue(0);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns. Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 */
void xen_mm_pin_all(void)
{
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&pgd_lock, flags);

        list_for_each_entry(page, &pgd_list, lru) {
                if (!PagePinned(page)) {
                        xen_pgd_pin((pgd_t *)page_address(page));
                        SetPageSavePinned(page);
                }
        }

        spin_unlock_irqrestore(&pgd_lock, flags);
}

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits. So do all
   the book-keeping now. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
        SetPagePinned(page);
        return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
        pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

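/*
 * pgd_walk() callback for unpinning: the reverse of pin_page(). Pinned
 * lowmem pages are remapped read-write (L1 tables are explicitly
 * unpinned first); unpinning never requires a kmap flush.
 */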
static int unpin_page(struct page *page, enum pt_level level)
{
        unsigned pgfl = TestClearPagePinned(page);

        if (pgfl && !PageHighMem(page)) {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                spinlock_t *ptl = NULL;
                struct multicall_space mcs;

                if (level == PT_PTE) {
                        ptl = lock_pte(page);

                        xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
                }

                mcs = __xen_mc_entry(0);

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL),
                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);

                if (ptl) {
                        /* unlock when batch completed */
                        xen_mc_callback(do_unlock, ptl);
                }
        }

        return 0;                /* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
        xen_mc_batch();

        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

        pgd_walk(pgd, unpin_page, TASK_SIZE);

        xen_mc_issue(0);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&pgd_lock, flags);

        list_for_each_entry(page, &pgd_list, lru) {
                if (PageSavePinned(page)) {
                        BUG_ON(!PagePinned(page));
                        printk("unpinning pinned %p\n", page_address(page));
                        xen_pgd_unpin((pgd_t *)page_address(page));
                        ClearPageSavePinned(page);
                }
        }

        spin_unlock_irqrestore(&pgd_lock, flags);
}

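/*
 * Make sure an mm's pagetable is pinned before it can be used: both
 * when an mm is first activated and when one is duplicated on fork.
 */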
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
        spin_lock(&next->page_table_lock);
        xen_pgd_pin(next->pgd);
        spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        spin_lock(&mm->page_table_lock);
        xen_pgd_pin(mm->pgd);
        spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
        struct mm_struct *mm = info;

        if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
                leave_mm(smp_processor_id());

        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
        if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
                load_cr3(swapper_pg_dir);
                arch_flush_lazy_cpu_mode();
        }
}

static void drop_mm_ref(struct mm_struct *mm)
{
        cpumask_t mask;
        unsigned cpu;

        if (current->active_mm == mm) {
                if (current->mm == mm)
                        load_cr3(swapper_pg_dir);
                else
                        leave_mm(smp_processor_id());
                arch_flush_lazy_cpu_mode();
        }

        /* Get the "official" set of cpus referring to our pagetable. */
        mask = mm->cpu_vm_mask;

        /* It's possible that a vcpu may have a stale reference to our
           cr3, because it's in lazy mode and hasn't yet flushed its
           set of pending hypercalls. In this case, we can
           look at its actual current cr3 value, and force it to flush
           if needed. */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
                        cpu_set(cpu, mask);
        }

        if (!cpus_empty(mask))
                xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it. This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process's pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing. This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
        get_cpu();                /* make sure we don't move around */
        drop_mm_ref(mm);
        put_cpu();

        spin_lock(&mm->page_table_lock);

        /* pgd may not be pinned in the error exit path of execve */
        if (PagePinned(virt_to_page(mm->pgd)))
                xen_pgd_unpin(mm->pgd);

        spin_unlock(&mm->page_table_lock);
}