Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when it is finally loaded into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
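/*
 * Illustrative example (not part of the original file): the pfn->mfn
 * substitution described above, for a single pte.  A pte handed to Xen
 * must carry the machine frame, so building one from a guest pfn amounts
 * to
 *
 *	pte_t pte = mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL);
 *
 * and reading it back through xen_pte_val() applies the inverse
 * mfn_to_pfn() translation.
 */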
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -070041#include <linux/sched.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070042#include <linux/highmem.h>
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070043#include <linux/debugfs.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070044#include <linux/bug.h>
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -070045#include <linux/vmalloc.h>
Randy Dunlap44408ad2009-05-12 13:31:40 -070046#include <linux/module.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090047#include <linux/gfp.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070048
49#include <asm/pgtable.h>
50#include <asm/tlbflush.h>
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -070051#include <asm/fixmap.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070052#include <asm/mmu_context.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080053#include <asm/setup.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070054#include <asm/paravirt.h>
Alex Nixon7347b402010-02-19 13:31:06 -050055#include <asm/e820.h>
Jeremy Fitzhardingecbcd79c2008-07-08 15:06:27 -070056#include <asm/linkage.h>
Alex Nixon08bbc9d2009-02-09 12:05:46 -080057#include <asm/page.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070058
59#include <asm/xen/hypercall.h>
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070060#include <asm/xen/hypervisor.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070061
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -080062#include <xen/xen.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070063#include <xen/page.h>
64#include <xen/interface/xen.h>
Stefano Stabellini59151002010-06-17 14:22:52 +010065#include <xen/interface/hvm/hvm_op.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080066#include <xen/interface/version.h>
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -080067#include <xen/interface/memory.h>
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -080068#include <xen/hvc-console.h>
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070069
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -070070#include "multicalls.h"
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -070071#include "mmu.h"
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070072#include "debugfs.h"
73
74#define MMU_UPDATE_HISTO 30
75
Alex Nixon19001c82009-02-09 12:05:46 -080076/*
77 * Protects atomic reservation decrease/increase against concurrent increases.
78 * Also protects non-atomic updates of current_pages and driver_pages, and
79 * balloon lists.
80 */
81DEFINE_SPINLOCK(xen_reservation_lock);
82
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -070083#ifdef CONFIG_XEN_DEBUG_FS
84
85static struct {
86 u32 pgd_update;
87 u32 pgd_update_pinned;
88 u32 pgd_update_batched;
89
90 u32 pud_update;
91 u32 pud_update_pinned;
92 u32 pud_update_batched;
93
94 u32 pmd_update;
95 u32 pmd_update_pinned;
96 u32 pmd_update_batched;
97
98 u32 pte_update;
99 u32 pte_update_pinned;
100 u32 pte_update_batched;
101
102 u32 mmu_update;
103 u32 mmu_update_extended;
104 u32 mmu_update_histo[MMU_UPDATE_HISTO];
105
106 u32 prot_commit;
107 u32 prot_commit_batched;
108
109 u32 set_pte_at;
110 u32 set_pte_at_batched;
111 u32 set_pte_at_pinned;
112 u32 set_pte_at_current;
113 u32 set_pte_at_kernel;
114} mmu_stats;
115
116static u8 zero_stats;
117
118static inline void check_zero(void)
119{
120 if (unlikely(zero_stats)) {
121 memset(&mmu_stats, 0, sizeof(mmu_stats));
122 zero_stats = 0;
123 }
124}
125
126#define ADD_STATS(elem, val) \
127 do { check_zero(); mmu_stats.elem += (val); } while(0)
128
129#else /* !CONFIG_XEN_DEBUG_FS */
130
131#define ADD_STATS(elem, val) do { (void)(val); } while(0)
132
133#endif /* CONFIG_XEN_DEBUG_FS */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700134
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -0800135
136/*
137 * Identity map, in addition to plain kernel map. This needs to be
138 * large enough to hold the page table pages needed to map the rest.
139 * Each page can map 2MB.
140 */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -0700141#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
142static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -0800143
144#ifdef CONFIG_X86_64
145/* l3 pud for userspace vsyscall mapping */
146static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
147#endif /* CONFIG_X86_64 */
148
149/*
150 * Note about cr3 (pagetable base) values:
151 *
152 * xen_cr3 contains the current logical cr3 value; it contains the
153 * last set cr3. This may not be the current effective cr3, because
154 * its update may still be lazily deferred. However, a vcpu looking
155 * at its own cr3 can use this value knowing that everything will
156 * be self-consistent.
157 *
158 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
159 * hypercall to set the vcpu cr3 is complete (so it may be a little
160 * out of date, but it will never be set early). If one vcpu is
161 * looking at another vcpu's cr3 value, it should use this variable.
162 */
163DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
164DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
165
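/*
 * Illustrative sketch (not part of the original file): how the two per-cpu
 * values above are meant to be read.  A vcpu looking at itself may use the
 * logical value; looking at another vcpu it must use the value that is only
 * updated once the set-cr3 hypercall has actually completed.
 */
static inline unsigned long __maybe_unused xen_example_read_cr3(int cpu)
{
	if (cpu == smp_processor_id())
		return per_cpu(xen_cr3, cpu);	/* logical, may be lazily deferred */

	return per_cpu(xen_current_cr3, cpu);	/* committed value */
}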
166
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700167/*
168 * Just beyond the highest usermode address. STACK_TOP_MAX has a
169 * redzone above it, so round it up to a PGD boundary.
170 */
171#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
172
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700173/*
174 * Xen leaves the responsibility for maintaining p2m mappings to the
175 * guests themselves, but it must also access and update the p2m array
176 * during suspend/resume when all the pages are reallocated.
177 *
178 * The p2m table is logically a flat array, but we implement it as a
179 * three-level tree to allow the address space to be sparse.
180 *
181 * Xen
182 * |
183 * p2m_top p2m_top_mfn
184 * / \ / \
185 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
186 * / \ / \ / /
187 * p2m p2m p2m p2m p2m p2m p2m ...
188 *
189 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
190 * maximum representable pseudo-physical address space is:
191 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
192 *
193 * P2M_PER_PAGE depends on the architecture, as an mfn is always
194 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
195 * 512 and 1024 entries respectively.
196 */
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700197
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700198static unsigned long max_p2m_pfn __read_mostly;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100199
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700200#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
201#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
202#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
Jeremy Fitzhardingecf0923e2008-05-26 23:31:20 +0100203
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700204#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
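
/*
 * Worked numbers (illustrative, not part of the original file): with 4K
 * pages on 64-bit each level holds 4096/8 = 512 entries, so MAX_P2M_PFN is
 * 512 * 512 * 512 = 2^27 pages, i.e. 512GB of pseudo-physical address
 * space; on 32-bit each level holds 1024 entries.
 */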
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100205
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700206/* Placeholders for holes in the address space */
207static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
208static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
209static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100210
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700211static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
212static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
213
214RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
215RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100216
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100217static inline unsigned p2m_top_index(unsigned long pfn)
218{
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700219 BUG_ON(pfn >= MAX_P2M_PFN);
220 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
221}
222
223static inline unsigned p2m_mid_index(unsigned long pfn)
224{
225 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100226}
227
228static inline unsigned p2m_index(unsigned long pfn)
229{
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700230 return pfn % P2M_PER_PAGE;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100231}
232
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700233static void p2m_top_init(unsigned long ***top)
234{
235 unsigned i;
236
237 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
238 top[i] = p2m_mid_missing;
239}
240
241static void p2m_top_mfn_init(unsigned long *top)
242{
243 unsigned i;
244
245 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
246 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
247}
248
249static void p2m_mid_init(unsigned long **mid)
250{
251 unsigned i;
252
253 for (i = 0; i < P2M_MID_PER_PAGE; i++)
254 mid[i] = p2m_missing;
255}
256
257static void p2m_mid_mfn_init(unsigned long *mid)
258{
259 unsigned i;
260
261 for (i = 0; i < P2M_MID_PER_PAGE; i++)
262 mid[i] = virt_to_mfn(p2m_missing);
263}
264
265static void p2m_init(unsigned long *p2m)
266{
267 unsigned i;
268
269 for (i = 0; i < P2M_MID_PER_PAGE; i++)
270 p2m[i] = INVALID_P2M_ENTRY;
271}
272
273/*
274 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
275 *
276 * This is called both at boot time, and after resuming from suspend:
277 * - At boot time we're called very early, and must use extend_brk()
278 * to allocate memory.
279 *
280 * - After resume we're called from within stop_machine, but the mfn
281 * tree should already be completely allocated.
282 */
Ian Campbellfa24ba62009-11-21 11:32:49 +0000283void xen_build_mfn_list_list(void)
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100284{
Jeremy Fitzhardingec3798062010-08-27 13:42:04 -0700285 unsigned pfn;
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100286
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700287 /* Pre-initialize p2m_top_mfn to be completely missing */
288 if (p2m_top_mfn == NULL) {
289 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
290 p2m_mid_mfn_init(p2m_mid_missing_mfn);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100291
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700292 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
293 p2m_top_mfn_init(p2m_top_mfn);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100294 }
295
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700296 for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) {
297 unsigned topidx = p2m_top_index(pfn);
298 unsigned mididx = p2m_mid_index(pfn);
299 unsigned long **mid;
300 unsigned long mid_mfn;
301 unsigned long *mid_mfn_p;
302
303 mid = p2m_top[topidx];
304
305 /* Don't bother allocating any mfn mid levels if
306 they're just missing */
307 if (mid[mididx] == p2m_missing)
308 continue;
309
310 mid_mfn = p2m_top_mfn[topidx];
311 mid_mfn_p = mfn_to_virt(mid_mfn);
312
313 if (mid_mfn_p == p2m_mid_missing_mfn) {
314 /*
315 * XXX boot-time only! We should never find
316 * missing parts of the mfn tree after
317 * runtime. extend_brk() will BUG if we call
318 * it too late.
319 */
320 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
321 p2m_mid_mfn_init(mid_mfn_p);
322
323 mid_mfn = virt_to_mfn(mid_mfn_p);
324
325 p2m_top_mfn[topidx] = mid_mfn;
326 }
327
328 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100329 }
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800330}
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100331
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800332void xen_setup_mfn_list_list(void)
333{
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100334 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
335
336 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700337 virt_to_mfn(p2m_top_mfn);
Jeremy Fitzhardinge1f2d9dd2010-08-26 17:11:35 -0700338 HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn;
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100339}
340
341/* Set up p2m_top to point to the domain-builder provided p2m pages */
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100342void __init xen_build_dynamic_phys_to_machine(void)
343{
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100344 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100345 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
Jeremy Fitzhardinged5edbc12008-05-26 23:31:22 +0100346 unsigned pfn;
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700347
Jeremy Fitzhardingea2e87522010-08-26 16:08:31 -0700348 max_p2m_pfn = max_pfn;
349
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700350 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
351 p2m_init(p2m_missing);
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700352
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700353 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
354 p2m_mid_init(p2m_mid_missing);
Jeremy Fitzhardingea171ce62010-08-26 15:04:48 -0700355
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700356 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
357 p2m_top_init(p2m_top);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100358
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700359 /*
360 * The domain builder gives us a pre-constructed p2m array in
361 * mfn_list for all the pages initially given to us, so we just
362 * need to graft that into our tree structure.
363 */
364 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100365 unsigned topidx = p2m_top_index(pfn);
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700366 unsigned mididx = p2m_mid_index(pfn);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100367
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700368 if (p2m_top[topidx] == p2m_mid_missing) {
369 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
370 p2m_mid_init(mid);
371
372 p2m_top[topidx] = mid;
373 }
374
375 p2m_top[topidx][mididx] = &mfn_list[pfn];
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100376 }
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800377
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700378 /* Allocate and initialize top and mid mfn levels */
Jeremy Fitzhardingecdaead62009-02-27 15:34:59 -0800379 xen_build_mfn_list_list();
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100380}
381
382unsigned long get_phys_to_machine(unsigned long pfn)
383{
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700384 unsigned topidx, mididx, idx;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100385
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700386 if (unlikely(pfn >= MAX_P2M_PFN))
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100387 return INVALID_P2M_ENTRY;
388
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100389 topidx = p2m_top_index(pfn);
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700390 mididx = p2m_mid_index(pfn);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100391 idx = p2m_index(pfn);
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700392
393 return p2m_top[topidx][mididx][idx];
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100394}
Ingo Molnar15ce60052008-06-02 13:20:11 +0200395EXPORT_SYMBOL_GPL(get_phys_to_machine);
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100396
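/*
 * Usage sketch (not part of the original file): holes in the pseudo-physical
 * space read back as INVALID_P2M_ENTRY, so a caller can test whether a pfn
 * currently has a backing machine frame like this.
 */
static inline bool __maybe_unused xen_example_pfn_is_backed(unsigned long pfn)
{
	return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
}
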
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700397static void *alloc_p2m_page(void)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100398{
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700399 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800400}
401
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700402static void free_p2m_page(void *p)
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800403{
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700404 free_page((unsigned long)p);
405}
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800406
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700407/*
408 * Fully allocate the p2m structure for a given pfn. We need to check
409 * that both the top and mid levels are allocated, and make sure the
410 * parallel mfn tree is kept in sync. We may race with other cpus, so
411 * the new pages are installed with cmpxchg; if we lose the race then
412 * simply free the page we allocated and use the one that's there.
413 */
414static bool alloc_p2m(unsigned long pfn)
415{
416 unsigned topidx, mididx;
417 unsigned long ***top_p, **mid;
418 unsigned long *top_mfn_p, *mid_mfn;
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800419
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700420 topidx = p2m_top_index(pfn);
421 mididx = p2m_mid_index(pfn);
422
423 top_p = &p2m_top[topidx];
424 mid = *top_p;
425
426 if (mid == p2m_mid_missing) {
427 /* Mid level is missing, allocate a new one */
428 mid = alloc_p2m_page();
429 if (!mid)
430 return false;
431
432 p2m_mid_init(mid);
433
434 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
435 free_p2m_page(mid);
436 }
437
438 top_mfn_p = &p2m_top_mfn[topidx];
439 mid_mfn = mfn_to_virt(*top_mfn_p);
440
441 if (mid_mfn == p2m_mid_missing_mfn) {
442 /* Separately check the mid mfn level */
443 unsigned long missing_mfn;
444 unsigned long mid_mfn_mfn;
445
446 mid_mfn = alloc_p2m_page();
447 if (!mid_mfn)
448 return false;
449
450 p2m_mid_mfn_init(mid_mfn);
451
452 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
453 mid_mfn_mfn = virt_to_mfn(mid_mfn);
454 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
455 free_p2m_page(mid_mfn);
456 }
457
458 if (p2m_top[topidx][mididx] == p2m_missing) {
459 /* p2m leaf page is missing */
460 unsigned long *p2m;
461
462 p2m = alloc_p2m_page();
463 if (!p2m)
464 return false;
465
466 p2m_init(p2m);
467
468 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
469 free_p2m_page(p2m);
470 else
471 mid_mfn[mididx] = virt_to_mfn(p2m);
472 }
473
474 return true;
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800475}
476
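/*
 * Illustrative sketch (not part of the original file): the install-or-free
 * idiom used by alloc_p2m() above, isolated for one mid-level slot.  The
 * loser of the cmpxchg race frees its freshly allocated page and carries on
 * with whatever the winner installed.
 */
static unsigned long ** __maybe_unused xen_example_install_mid(unsigned long ***slot)
{
	unsigned long **mid = alloc_p2m_page();

	if (!mid)
		return NULL;

	p2m_mid_init(mid);

	if (cmpxchg(slot, p2m_mid_missing, mid) != p2m_mid_missing) {
		free_p2m_page(mid);	/* lost the race */
		mid = *slot;		/* use the winner's page */
	}

	return mid;
}
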
477/* Try to install p2m mapping; fail if intermediate bits missing */
478bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
479{
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700480 unsigned topidx, mididx, idx;
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800481
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700482 if (unlikely(pfn >= MAX_P2M_PFN)) {
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800483 BUG_ON(mfn != INVALID_P2M_ENTRY);
484 return true;
485 }
486
487 topidx = p2m_top_index(pfn);
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700488 mididx = p2m_mid_index(pfn);
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800489 idx = p2m_index(pfn);
Jeremy Fitzhardinge58e05022010-08-27 13:28:48 -0700490
491 if (p2m_top[topidx][mididx] == p2m_missing)
492 return mfn == INVALID_P2M_ENTRY;
493
494 p2m_top[topidx][mididx][idx] = mfn;
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800495
496 return true;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100497}
498
Jeremy Fitzhardingec3798062010-08-27 13:42:04 -0700499bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100500{
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100501 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
502 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
Jeremy Fitzhardingec3798062010-08-27 13:42:04 -0700503 return true;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100504 }
505
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800506 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
Jeremy Fitzhardingec3798062010-08-27 13:42:04 -0700507 if (!alloc_p2m(pfn))
508 return false;
Jeremy Fitzhardinge8006ec32008-05-26 23:31:19 +0100509
Jeremy Fitzhardingee791ca02009-02-26 15:48:33 -0800510 if (!__set_phys_to_machine(pfn, mfn))
Jeremy Fitzhardingec3798062010-08-27 13:42:04 -0700511 return false;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100512 }
Jeremy Fitzhardingec3798062010-08-27 13:42:04 -0700513
514 return true;
Jeremy Fitzhardinged451bb72008-05-26 23:31:18 +0100515}
516
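/*
 * Usage sketch (not part of the original file): a caller that hands a page
 * back to the hypervisor (as the balloon driver does) records the resulting
 * hole with something like:
 *
 *	if (!set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
 *		BUG();
 */
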
Jeremy Fitzhardinge9976b392009-02-27 09:19:26 -0800517unsigned long arbitrary_virt_to_mfn(void *vaddr)
518{
519 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
520
521 return PFN_DOWN(maddr.maddr);
522}
523
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700524xmaddr_t arbitrary_virt_to_machine(void *vaddr)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700525{
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700526 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100527 unsigned int level;
Chris Lalancette9f32d212008-10-23 17:40:25 -0700528 pte_t *pte;
529 unsigned offset;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700530
Chris Lalancette9f32d212008-10-23 17:40:25 -0700531 /*
532 * if the PFN is in the linear mapped vaddr range, we can just use
533 * the (quick) virt_to_machine() p2m lookup
534 */
535 if (virt_addr_valid(vaddr))
536 return virt_to_machine(vaddr);
537
538 /* otherwise we have to do a (slower) full page-table walk */
539
540 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700541 BUG_ON(pte == NULL);
Chris Lalancette9f32d212008-10-23 17:40:25 -0700542 offset = address & ~PAGE_MASK;
Jeremy Fitzhardingeebd879e2008-07-08 15:06:54 -0700543 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700544}
545
546void make_lowmem_page_readonly(void *vaddr)
547{
548 pte_t *pte, ptev;
549 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100550 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700551
Ingo Molnarf0646e42008-01-30 13:33:43 +0100552 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700553 BUG_ON(pte == NULL);
554
555 ptev = pte_wrprotect(*pte);
556
557 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
558 BUG();
559}
560
561void make_lowmem_page_readwrite(void *vaddr)
562{
563 pte_t *pte, ptev;
564 unsigned long address = (unsigned long)vaddr;
Harvey Harrisonda7bfc52008-02-09 23:24:08 +0100565 unsigned int level;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700566
Ingo Molnarf0646e42008-01-30 13:33:43 +0100567 pte = lookup_address(address, &level);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700568 BUG_ON(pte == NULL);
569
570 ptev = pte_mkwrite(*pte);
571
572 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
573 BUG();
574}
575
576
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700577static bool xen_page_pinned(void *ptr)
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100578{
579 struct page *page = virt_to_page(ptr);
580
581 return PagePinned(page);
582}
583
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800584static bool xen_iomap_pte(pte_t pte)
585{
Alex Nixon7347b402010-02-19 13:31:06 -0500586 return pte_flags(pte) & _PAGE_IOMAP;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800587}
588
589static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
590{
591 struct multicall_space mcs;
592 struct mmu_update *u;
593
594 mcs = xen_mc_entry(sizeof(*u));
595 u = mcs.args;
596
597 /* ptep might be kmapped when using 32-bit HIGHPTE */
598 u->ptr = arbitrary_virt_to_machine(ptep).maddr;
599 u->val = pte_val_ma(pteval);
600
601 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
602
603 xen_mc_issue(PARAVIRT_LAZY_MMU);
604}
605
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700606static void xen_extend_mmu_update(const struct mmu_update *update)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700607{
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700608 struct multicall_space mcs;
609 struct mmu_update *u;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700610
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700611 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
612
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700613 if (mcs.mc != NULL) {
614 ADD_STATS(mmu_update_extended, 1);
615 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
616
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700617 mcs.mc->args[1]++;
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700618
619 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
620 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
621 else
622 ADD_STATS(mmu_update_histo[0], 1);
623 } else {
624 ADD_STATS(mmu_update, 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700625 mcs = __xen_mc_entry(sizeof(*u));
626 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700627 ADD_STATS(mmu_update_histo[1], 1);
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700628 }
629
630 u = mcs.args;
631 *u = *update;
632}
633
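/*
 * Usage sketch (not part of the original file): the batching pattern the
 * helper above enables.  Any number of updates can be queued between
 * xen_mc_batch() and xen_mc_issue(); consecutive mmu_update requests are
 * folded into a single hypercall argument list by xen_extend_mmu_update().
 */
static void __maybe_unused xen_example_set_two_pmds(pmd_t *a, pmd_t *b, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();
	xen_mc_batch();

	u.ptr = arbitrary_virt_to_machine(a).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	u.ptr = arbitrary_virt_to_machine(b).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	/* Flushes immediately unless we're already in lazy MMU mode. */
	xen_mc_issue(PARAVIRT_LAZY_MMU);
	preempt_enable();
}
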
634void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
635{
636 struct mmu_update u;
637
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700638 preempt_disable();
639
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700640 xen_mc_batch();
641
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700642 /* ptr may be ioremapped for 64-bit pagetable setup */
643 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700644 u.val = pmd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700645 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700646
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700647 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
648
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700649 xen_mc_issue(PARAVIRT_LAZY_MMU);
650
651 preempt_enable();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700652}
653
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100654void xen_set_pmd(pmd_t *ptr, pmd_t val)
655{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700656 ADD_STATS(pmd_update, 1);
657
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100658 /* If page is not pinned, we can just update the entry
659 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700660 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100661 *ptr = val;
662 return;
663 }
664
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700665 ADD_STATS(pmd_update_pinned, 1);
666
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100667 xen_set_pmd_hyper(ptr, val);
668}
669
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700670/*
671 * Associate a virtual page frame with a given physical page frame
672 * and protection flags for that frame.
673 */
674void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
675{
Jeremy Fitzhardinge836fe2f2008-07-08 15:06:58 -0700676 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700677}
678
679void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
680 pte_t *ptep, pte_t pteval)
681{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800682 if (xen_iomap_pte(pteval)) {
683 xen_set_iomap_pte(ptep, pteval);
684 goto out;
685 }
686
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700687 ADD_STATS(set_pte_at, 1);
688// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
689 ADD_STATS(set_pte_at_current, mm == current->mm);
690 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
691
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700692 if (mm == current->mm || mm == &init_mm) {
Jeremy Fitzhardinge8965c1c2007-10-16 11:51:29 -0700693 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700694 struct multicall_space mcs;
695 mcs = xen_mc_entry(0);
696
697 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700698 ADD_STATS(set_pte_at_batched, 1);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700699 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700700 goto out;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700701 } else
702 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700703 goto out;
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700704 }
705 xen_set_pte(ptep, pteval);
Jeremy Fitzhardinge2bd50032008-04-02 10:54:10 -0700706
Jeremy Fitzhardinge2829b442009-02-17 23:53:19 -0800707out: return;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700708}
709
Tejf63c2f22008-12-16 11:56:06 -0800710pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
711 unsigned long addr, pte_t *ptep)
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700712{
713 /* Just return the pte as-is. We preserve the bits on commit */
714 return *ptep;
715}
716
717void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
718 pte_t *ptep, pte_t pte)
719{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700720 struct mmu_update u;
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700721
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700722 xen_mc_batch();
723
Chris Lalancette9f32d212008-10-23 17:40:25 -0700724 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700725 u.val = pte_val_ma(pte);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700726 xen_extend_mmu_update(&u);
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700727
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700728 ADD_STATS(prot_commit, 1);
729 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
730
Jeremy Fitzhardingee57778a2008-06-16 04:30:02 -0700731 xen_mc_issue(PARAVIRT_LAZY_MMU);
732}
733
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700734/* Assume pteval_t is equivalent to all the other *val_t types. */
735static pteval_t pte_mfn_to_pfn(pteval_t val)
736{
737 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700738 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700739 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700740 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700741 }
742
743 return val;
744}
745
746static pteval_t pte_pfn_to_mfn(pteval_t val)
747{
748 if (val & _PAGE_PRESENT) {
Jeremy Fitzhardinge59438c92008-07-21 22:59:42 -0700749 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
Jeremy Fitzhardinge77be1fa2008-07-21 22:59:56 -0700750 pteval_t flags = val & PTE_FLAGS_MASK;
Jeremy Fitzhardinged8355ac2008-07-03 22:10:18 -0700751 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700752 }
753
754 return val;
755}
756
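/*
 * Illustrative property (not part of the original file): for a present
 * entry whose frame is tracked by the p2m/m2p tables, the two helpers above
 * are inverses of each other:
 *
 *	pte_mfn_to_pfn(pte_pfn_to_mfn(val)) == val
 */
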
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800757static pteval_t iomap_pte(pteval_t val)
758{
759 if (val & _PAGE_PRESENT) {
760 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
761 pteval_t flags = val & PTE_FLAGS_MASK;
762
763 /* We assume the pte frame number is an MFN, so
764 just use it as-is. */
765 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
766 }
767
768 return val;
769}
770
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700771pteval_t xen_pte_val(pte_t pte)
772{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800773 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
774 return pte.pte;
775
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700776 return pte_mfn_to_pfn(pte.pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700777}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800778PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700779
780pgdval_t xen_pgd_val(pgd_t pgd)
781{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700782 return pte_mfn_to_pfn(pgd.pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700783}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800784PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700785
786pte_t xen_make_pte(pteval_t pte)
787{
Alex Nixon7347b402010-02-19 13:31:06 -0500788 phys_addr_t addr = (pte & PTE_PFN_MASK);
789
790 /*
791 * Unprivileged domains are allowed to do IOMAPpings for
792 * PCI passthrough, but not map ISA space. The ISA
793 * mappings are just dummy local mappings to keep other
794 * parts of the kernel happy.
795 */
796 if (unlikely(pte & _PAGE_IOMAP) &&
797 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800798 pte = iomap_pte(pte);
Alex Nixon7347b402010-02-19 13:31:06 -0500799 } else {
800 pte &= ~_PAGE_IOMAP;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800801 pte = pte_pfn_to_mfn(pte);
Alex Nixon7347b402010-02-19 13:31:06 -0500802 }
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800803
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700804 return native_make_pte(pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700805}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800806PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700807
808pgd_t xen_make_pgd(pgdval_t pgd)
809{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700810 pgd = pte_pfn_to_mfn(pgd);
811 return native_make_pgd(pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700812}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800813PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700814
815pmdval_t xen_pmd_val(pmd_t pmd)
816{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700817 return pte_mfn_to_pfn(pmd.pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700818}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800819PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +0100820
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100821void xen_set_pud_hyper(pud_t *ptr, pud_t val)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700822{
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700823 struct mmu_update u;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700824
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700825 preempt_disable();
826
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700827 xen_mc_batch();
828
Jeremy Fitzhardingece803e72008-07-08 15:06:55 -0700829 /* ptr may be ioremapped for 64-bit pagetable setup */
830 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
Jeremy Fitzhardinge400d3492008-06-16 04:30:03 -0700831 u.val = pud_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700832 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700833
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700834 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
835
Jeremy Fitzhardinged66bf8f2007-07-17 18:37:06 -0700836 xen_mc_issue(PARAVIRT_LAZY_MMU);
837
838 preempt_enable();
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700839}
840
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100841void xen_set_pud(pud_t *ptr, pud_t val)
842{
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700843 ADD_STATS(pud_update, 1);
844
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100845 /* If page is not pinned, we can just update the entry
846 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700847 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100848 *ptr = val;
849 return;
850 }
851
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700852 ADD_STATS(pud_update_pinned, 1);
853
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100854 xen_set_pud_hyper(ptr, val);
855}
856
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700857void xen_set_pte(pte_t *ptep, pte_t pte)
858{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800859 if (xen_iomap_pte(pte)) {
860 xen_set_iomap_pte(ptep, pte);
861 return;
862 }
863
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700864 ADD_STATS(pte_update, 1);
865// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
866 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
867
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700868#ifdef CONFIG_X86_PAE
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700869 ptep->pte_high = pte.pte_high;
870 smp_wmb();
871 ptep->pte_low = pte.pte_low;
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700872#else
873 *ptep = pte;
874#endif
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -0700875}
876
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700877#ifdef CONFIG_X86_PAE
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700878void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
879{
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -0800880 if (xen_iomap_pte(pte)) {
881 xen_set_iomap_pte(ptep, pte);
882 return;
883 }
884
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700885 set_64bit((u64 *)ptep, native_pte_val(pte));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700886}
887
888void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
889{
890 ptep->pte_low = 0;
891 smp_wmb(); /* make sure low gets written first */
892 ptep->pte_high = 0;
893}
894
895void xen_pmd_clear(pmd_t *pmdp)
896{
Jeremy Fitzhardingee2426cf2008-05-31 01:24:27 +0100897 set_pmd(pmdp, __pmd(0));
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700898}
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700899#endif /* CONFIG_X86_PAE */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700900
Jeremy Fitzhardingeabf33032008-03-17 16:37:07 -0700901pmd_t xen_make_pmd(pmdval_t pmd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700902{
Jeremy Fitzhardingeebb9cfe2008-06-16 15:01:56 -0700903 pmd = pte_pfn_to_mfn(pmd);
Jeremy Fitzhardinge947a69c2008-03-17 16:37:09 -0700904 return native_make_pmd(pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700905}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800906PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -0700907
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700908#if PAGETABLE_LEVELS == 4
909pudval_t xen_pud_val(pud_t pud)
910{
911 return pte_mfn_to_pfn(pud.pud);
912}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800913PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700914
915pud_t xen_make_pud(pudval_t pud)
916{
917 pud = pte_pfn_to_mfn(pud);
918
919 return native_make_pud(pud);
920}
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -0800921PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700922
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700923pgd_t *xen_get_user_pgd(pgd_t *pgd)
924{
925 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
926 unsigned offset = pgd - pgd_page;
927 pgd_t *user_ptr = NULL;
928
929 if (offset < pgd_index(USER_LIMIT)) {
930 struct page *page = virt_to_page(pgd_page);
931 user_ptr = (pgd_t *)page->private;
932 if (user_ptr)
933 user_ptr += offset;
934 }
935
936 return user_ptr;
937}
938
939static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700940{
941 struct mmu_update u;
942
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700943 u.ptr = virt_to_machine(ptr).maddr;
944 u.val = pgd_val_ma(val);
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700945 xen_extend_mmu_update(&u);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700946}
947
948/*
949 * Raw hypercall-based set_pgd, intended for use in early boot before
950 * there's a page structure. This implies:
951 * 1. The only existing pagetable is the kernel's
952 * 2. It is always pinned
953 * 3. It has no user pagetable attached to it
954 */
955void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
956{
957 preempt_disable();
958
959 xen_mc_batch();
960
961 __xen_set_pgd_hyper(ptr, val);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700962
963 xen_mc_issue(PARAVIRT_LAZY_MMU);
964
965 preempt_enable();
966}
967
968void xen_set_pgd(pgd_t *ptr, pgd_t val)
969{
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700970 pgd_t *user_ptr = xen_get_user_pgd(ptr);
971
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700972 ADD_STATS(pgd_update, 1);
973
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700974 /* If page is not pinned, we can just update the entry
975 directly */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700976 if (!xen_page_pinned(ptr)) {
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700977 *ptr = val;
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700978 if (user_ptr) {
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -0700979 WARN_ON(xen_page_pinned(user_ptr));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700980 *user_ptr = val;
981 }
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700982 return;
983 }
984
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -0700985 ADD_STATS(pgd_update_pinned, 1);
986 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
987
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -0700988 /* If it's pinned, then we can at least batch the kernel and
989 user updates together. */
990 xen_mc_batch();
991
992 __xen_set_pgd_hyper(ptr, val);
993 if (user_ptr)
994 __xen_set_pgd_hyper(user_ptr, val);
995
996 xen_mc_issue(PARAVIRT_LAZY_MMU);
Jeremy Fitzhardingef6e58732008-07-08 15:06:38 -0700997}
998#endif /* PAGETABLE_LEVELS == 4 */
999
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001000/*
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001001 * (Yet another) pagetable walker. This one is intended for pinning a
1002 * pagetable. This means that it walks a pagetable and calls the
1003 * callback function on each page it finds making up the page table,
1004 * at every level. It walks the entire pagetable, but it only bothers
1005 * pinning pte pages which are below limit. In the normal case this
1006 * will be STACK_TOP_MAX, but at boot we need to pin up to
1007 * FIXADDR_TOP.
1008 *
1009 * For 32-bit the important bit is that we don't pin beyond there,
1010 * because then we start getting into Xen's ptes.
1011 *
1012 * For 64-bit, we must skip the Xen hole in the middle of the address
1013 * space, just after the big x86-64 virtual hole.
1014 */
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001015static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
1016 int (*func)(struct mm_struct *mm, struct page *,
1017 enum pt_level),
1018 unsigned long limit)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001019{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001020 int flush = 0;
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001021 unsigned hole_low, hole_high;
1022 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
1023 unsigned pgdidx, pudidx, pmdidx;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001024
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001025 /* The limit is the last byte to be touched */
1026 limit--;
1027 BUG_ON(limit >= FIXADDR_TOP);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001028
1029 if (xen_feature(XENFEAT_auto_translated_physmap))
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001030 return 0;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001031
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001032 /*
1033 * 64-bit has a great big hole in the middle of the address
1034 * space, which contains the Xen mappings. On 32-bit these
1035 * will end up making a zero-sized hole and so is a no-op.
1036 */
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001037 hole_low = pgd_index(USER_LIMIT);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001038 hole_high = pgd_index(PAGE_OFFSET);
1039
1040 pgdidx_limit = pgd_index(limit);
1041#if PTRS_PER_PUD > 1
1042 pudidx_limit = pud_index(limit);
1043#else
1044 pudidx_limit = 0;
1045#endif
1046#if PTRS_PER_PMD > 1
1047 pmdidx_limit = pmd_index(limit);
1048#else
1049 pmdidx_limit = 0;
1050#endif
1051
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001052 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001053 pud_t *pud;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001054
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001055 if (pgdidx >= hole_low && pgdidx < hole_high)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001056 continue;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001057
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001058 if (!pgd_val(pgd[pgdidx]))
1059 continue;
1060
1061 pud = pud_offset(&pgd[pgdidx], 0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001062
1063 if (PTRS_PER_PUD > 1) /* not folded */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001064 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001065
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001066 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001067 pmd_t *pmd;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001068
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001069 if (pgdidx == pgdidx_limit &&
1070 pudidx > pudidx_limit)
1071 goto out;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001072
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001073 if (pud_none(pud[pudidx]))
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001074 continue;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001075
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001076 pmd = pmd_offset(&pud[pudidx], 0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001077
1078 if (PTRS_PER_PMD > 1) /* not folded */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001079 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001080
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001081 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
1082 struct page *pte;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001083
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001084 if (pgdidx == pgdidx_limit &&
1085 pudidx == pudidx_limit &&
1086 pmdidx > pmdidx_limit)
1087 goto out;
1088
1089 if (pmd_none(pmd[pmdidx]))
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001090 continue;
1091
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001092 pte = pmd_page(pmd[pmdidx]);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001093 flush |= (*func)(mm, pte, PT_PTE);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001094 }
1095 }
1096 }
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001097
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001098out:
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001099 /* Do the top level last, so that the callbacks can use it as
1100 a cue to do final things like tlb flushes. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001101 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001102
1103 return flush;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001104}
1105
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001106static int xen_pgd_walk(struct mm_struct *mm,
1107 int (*func)(struct mm_struct *mm, struct page *,
1108 enum pt_level),
1109 unsigned long limit)
1110{
1111 return __xen_pgd_walk(mm, mm->pgd, func, limit);
1112}
1113
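/*
 * Usage sketch (not part of the original file): the callback contract of
 * the walker above.  Each page making up the pagetable is handed in with
 * its level; the return value says whether the caller needs a TLB flush.
 * A walk over an mm's pagetable would be started with something like
 * xen_pgd_walk(mm, xen_example_note_page, STACK_TOP_MAX).
 */
static int __maybe_unused xen_example_note_page(struct mm_struct *mm,
						struct page *page,
						enum pt_level level)
{
	/* Inspect or tag the page here; level is one of PT_PGD, PT_PUD,
	   PT_PMD or PT_PTE. */
	return 0;	/* no flush needed */
}
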
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001114/* If we're using split pte locks, then take the page's lock and
1115 return a pointer to it. Otherwise return NULL. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001116static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001117{
1118 spinlock_t *ptl = NULL;
1119
Jeremy Fitzhardingef7d0b922008-09-09 15:43:22 -07001120#if USE_SPLIT_PTLOCKS
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001121 ptl = __pte_lockptr(page);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001122 spin_lock_nest_lock(ptl, &mm->page_table_lock);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001123#endif
1124
1125 return ptl;
1126}
1127
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001128static void xen_pte_unlock(void *v)
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001129{
1130 spinlock_t *ptl = v;
1131 spin_unlock(ptl);
1132}
1133
1134static void xen_do_pin(unsigned level, unsigned long pfn)
1135{
1136 struct mmuext_op *op;
1137 struct multicall_space mcs;
1138
1139 mcs = __xen_mc_entry(sizeof(*op));
1140 op = mcs.args;
1141 op->cmd = level;
1142 op->arg1.mfn = pfn_to_mfn(pfn);
1143 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1144}
1145
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001146static int xen_pin_page(struct mm_struct *mm, struct page *page,
1147 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001148{
Christoph Lameterd60cd462008-04-28 02:12:51 -07001149 unsigned pgfl = TestSetPagePinned(page);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001150 int flush;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001151
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001152 if (pgfl)
1153 flush = 0; /* already pinned */
1154 else if (PageHighMem(page))
1155 /* kmaps need flushing if we found an unpinned
1156 highpage */
1157 flush = 1;
1158 else {
1159 void *pt = lowmem_page_address(page);
1160 unsigned long pfn = page_to_pfn(page);
1161 struct multicall_space mcs = __xen_mc_entry(0);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001162 spinlock_t *ptl;
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001163
1164 flush = 0;
1165
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001166 /*
1167 * We need to hold the pagetable lock between the time
1168 * we make the pagetable RO and when we actually pin
1169 * it. If we don't, then other users may come in and
1170 * attempt to update the pagetable by writing it,
1171 * which will fail because the memory is RO but not
1172 * pinned, so Xen won't do the trap'n'emulate.
1173 *
1174 * If we're using split pte locks, we can't hold the
1175 * entire pagetable's worth of locks during the
1176 * traverse, because we may wrap the preempt count (8
1177 * bits). The solution is to mark RO and pin each PTE
1178 * page while holding the lock. This means the number
1179 * of locks we end up holding is never more than a
1180 * batch size (~32 entries, at present).
1181 *
1182 * If we're not using split pte locks, we needn't pin
1183 * the PTE pages independently, because we're
1184 * protected by the overall pagetable lock.
1185 */
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001186 ptl = NULL;
1187 if (level == PT_PTE)
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001188 ptl = xen_pte_lock(page, mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001189
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001190 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1191 pfn_pte(pfn, PAGE_KERNEL_RO),
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001192 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1193
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001194 if (ptl) {
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001195 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
1196
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001197 /* Queue a deferred unlock for when this batch
1198 is completed. */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001199 xen_mc_callback(xen_pte_unlock, ptl);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001200 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001201 }
1202
1203 return flush;
1204}
1205
1206/* This is called just after a mm has been created, but it has not
1207 been used yet. We need to make sure that its pagetable is all
1208 read-only, and can be pinned. */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001209static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001210{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001211 xen_mc_batch();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001212
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001213 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +11001214 /* re-enable interrupts for flushing */
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001215 xen_mc_issue(0);
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +11001216
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001217 kmap_flush_unused();
Jeremy Fitzhardinged05fdf32008-10-28 19:23:06 +11001218
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001219 xen_mc_batch();
1220 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001221
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001222#ifdef CONFIG_X86_64
1223 {
1224 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1225
1226 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1227
1228 if (user_pgd) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001229 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
Tejf63c2f22008-12-16 11:56:06 -08001230 xen_do_pin(MMUEXT_PIN_L4_TABLE,
1231 PFN_DOWN(__pa(user_pgd)));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001232 }
1233 }
1234#else /* CONFIG_X86_32 */
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001235#ifdef CONFIG_X86_PAE
1236 /* Need to make sure unshared kernel PMD is pinnable */
Jeremy Fitzhardinge47cb2ed2008-11-06 13:48:24 -08001237 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001238 PT_PMD);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001239#endif
Jeremy Fitzhardinge28499142008-05-09 12:05:57 +01001240 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001241#endif /* CONFIG_X86_64 */
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001242 xen_mc_issue(0);
1243}
1244
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001245static void xen_pgd_pin(struct mm_struct *mm)
1246{
1247 __xen_pgd_pin(mm, mm->pgd);
1248}
1249
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001250/*
1251 * On save, we need to pin all pagetables to make sure they get their
1252 * mfns turned into pfns. Search the list for any unpinned pgds and pin
1253 * them (unpinned pgds are not currently in use, probably because the
1254 * process is under construction or destruction).
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001255 *
1256 * Expected to be called in stop_machine() ("equivalent to taking
1257 * every spinlock in the system"), so the locking doesn't really
1258 * matter all that much.
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001259 */
1260void xen_mm_pin_all(void)
1261{
1262 unsigned long flags;
1263 struct page *page;
1264
1265 spin_lock_irqsave(&pgd_lock, flags);
1266
1267 list_for_each_entry(page, &pgd_list, lru) {
1268 if (!PagePinned(page)) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001269 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001270 SetPageSavePinned(page);
1271 }
1272 }
1273
1274 spin_unlock_irqrestore(&pgd_lock, flags);
1275}
1276
/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits. So do all
 * the book-keeping now.
 */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001282static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1283 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001284{
1285 SetPagePinned(page);
1286 return 0;
1287}
1288
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001289static void __init xen_mark_init_mm_pinned(void)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001290{
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001291 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001292}
1293
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001294static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1295 enum pt_level level)
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001296{
Christoph Lameterd60cd462008-04-28 02:12:51 -07001297 unsigned pgfl = TestClearPagePinned(page);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001298
1299 if (pgfl && !PageHighMem(page)) {
1300 void *pt = lowmem_page_address(page);
1301 unsigned long pfn = page_to_pfn(page);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001302 spinlock_t *ptl = NULL;
1303 struct multicall_space mcs;
1304
		/*
		 * Do the converse of xen_pin_page. If we're using split
		 * pte locks, we must be holding the lock while the pte
		 * page is unpinned but still RO, to prevent concurrent
		 * updates from seeing it in this partially-pinned state.
		 */
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001312 if (level == PT_PTE) {
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001313 ptl = xen_pte_lock(page, mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001314
Jeremy Fitzhardinge11ad93e2008-08-19 13:32:51 -07001315 if (ptl)
1316 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001317 }
1318
1319 mcs = __xen_mc_entry(0);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001320
1321 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1322 pfn_pte(pfn, PAGE_KERNEL),
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001323 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1324
1325 if (ptl) {
1326 /* unlock when batch completed */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001327 xen_mc_callback(xen_pte_unlock, ptl);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001328 }
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001329 }
1330
1331 return 0; /* never need to flush on unpin */
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001332}
1333
/* Release a pagetable's pages back to normal RW */
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001335static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001336{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001337 xen_mc_batch();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001338
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001339 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001340
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001341#ifdef CONFIG_X86_64
1342 {
1343 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1344
1345 if (user_pgd) {
Tejf63c2f22008-12-16 11:56:06 -08001346 xen_do_pin(MMUEXT_UNPIN_TABLE,
1347 PFN_DOWN(__pa(user_pgd)));
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001348 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001349 }
1350 }
1351#endif
1352
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001353#ifdef CONFIG_X86_PAE
1354 /* Need to make sure unshared kernel PMD is unpinned */
Jeremy Fitzhardinge47cb2ed2008-11-06 13:48:24 -08001355 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001356 PT_PMD);
Jeremy Fitzhardinge5deb30d2008-07-08 15:07:06 -07001357#endif
Jeremy Fitzhardinged6182fb2008-07-08 15:07:13 -07001358
Ian Campbell86bbc2c2008-11-21 10:21:33 +00001359 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001360
1361 xen_mc_issue(0);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001362}
1363
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001364static void xen_pgd_unpin(struct mm_struct *mm)
1365{
1366 __xen_pgd_unpin(mm, mm->pgd);
1367}
1368
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001369/*
1370 * On resume, undo any pinning done at save, so that the rest of the
1371 * kernel doesn't see any unexpected pinned pagetables.
1372 */
1373void xen_mm_unpin_all(void)
1374{
1375 unsigned long flags;
1376 struct page *page;
1377
1378 spin_lock_irqsave(&pgd_lock, flags);
1379
1380 list_for_each_entry(page, &pgd_list, lru) {
1381 if (PageSavePinned(page)) {
1382 BUG_ON(!PagePinned(page));
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001383 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
Jeremy Fitzhardinge0e913982008-05-26 23:31:27 +01001384 ClearPageSavePinned(page);
1385 }
1386 }
1387
1388 spin_unlock_irqrestore(&pgd_lock, flags);
1389}
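
/*
 * Illustrative sketch (the callback below is hypothetical, not part of
 * this file): the expected pairing of the two helpers above.  Pinning
 * happens before the domain image is saved so every pagetable's mfns
 * can be canonicalised, and the extra pins are dropped again once the
 * domain is running after resume.  The real suspend path invokes these
 * from a quiesced, stop_machine()-like context, as noted above.
 */
static int __maybe_unused example_save_restore_pagetables(void *unused)
{
	xen_mm_pin_all();

	/* ... domain is saved and later resumed here ... */

	xen_mm_unpin_all();
	return 0;
}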
1390
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001391void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1392{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001393 spin_lock(&next->page_table_lock);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001394 xen_pgd_pin(next);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001395 spin_unlock(&next->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001396}
1397
1398void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1399{
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001400 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001401 xen_pgd_pin(mm);
Jeremy Fitzhardingef4f97b32007-07-17 18:37:05 -07001402 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001403}
1404
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001405
1406#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
1409static void drop_other_mm_ref(void *info)
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001410{
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001411 struct mm_struct *mm = info;
Jeremy Fitzhardingece87b3d2008-07-08 15:06:40 -07001412 struct mm_struct *active_mm;
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001413
Brian Gerst9eb912d2009-01-19 00:38:57 +09001414 active_mm = percpu_read(cpu_tlbstate.active_mm);
Jeremy Fitzhardingece87b3d2008-07-08 15:06:40 -07001415
1416 if (active_mm == mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001417 leave_mm(smp_processor_id());
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001418
1419 /* If this cpu still has a stale cr3 reference, then make sure
1420 it has been flushed. */
Jeremy Fitzhardinge7fd7d832009-02-17 23:24:03 -08001421 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001422 load_cr3(swapper_pg_dir);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001423}
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001424
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001425static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001426{
Mike Travise4d98202008-12-16 17:34:05 -08001427 cpumask_var_t mask;
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001428 unsigned cpu;
1429
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001430 if (current->active_mm == mm) {
1431 if (current->mm == mm)
1432 load_cr3(swapper_pg_dir);
1433 else
1434 leave_mm(smp_processor_id());
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001435 }
1436
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001437 /* Get the "official" set of cpus referring to our pagetable. */
Mike Travise4d98202008-12-16 17:34:05 -08001438 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1439 for_each_online_cpu(cpu) {
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001440 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
Mike Travise4d98202008-12-16 17:34:05 -08001441 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1442 continue;
1443 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1444 }
1445 return;
1446 }
Rusty Russell78f1c4d2009-09-24 09:34:51 -06001447 cpumask_copy(mask, mm_cpumask(mm));
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001448
	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its set
	   of pending hypercalls. In this case, we can look at its
	   actual current cr3 value, and force it to flush if needed. */
1454 for_each_online_cpu(cpu) {
1455 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
Mike Travise4d98202008-12-16 17:34:05 -08001456 cpumask_set_cpu(cpu, mask);
Jeremy Fitzhardinge9f799912007-10-16 11:51:30 -07001457 }
1458
Mike Travise4d98202008-12-16 17:34:05 -08001459 if (!cpumask_empty(mask))
1460 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1461 free_cpumask_var(mask);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001462}
1463#else
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001464static void xen_drop_mm_ref(struct mm_struct *mm)
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001465{
1466 if (current->active_mm == mm)
1467 load_cr3(swapper_pg_dir);
1468}
1469#endif
1470
/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it. This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy TLB flushing. This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
1485void xen_exit_mmap(struct mm_struct *mm)
1486{
1487 get_cpu(); /* make sure we don't move around */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001488 xen_drop_mm_ref(mm);
Jeremy Fitzhardingef87e4ca2007-07-17 18:37:06 -07001489 put_cpu();
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001490
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001491 spin_lock(&mm->page_table_lock);
Jeremy Fitzhardingedf912ea2007-09-25 11:50:00 -07001492
1493 /* pgd may not be pinned in the error exit path of execve */
Jeremy Fitzhardinge7708ad62008-08-19 13:34:22 -07001494 if (xen_page_pinned(mm->pgd))
Jeremy Fitzhardingeeefb47f2008-10-08 13:01:39 -07001495 xen_pgd_unpin(mm);
Jeremy Fitzhardinge74260712007-10-16 11:51:30 -07001496
Jeremy Fitzhardingef120f132007-07-17 18:37:06 -07001497 spin_unlock(&mm->page_table_lock);
Jeremy Fitzhardinge3b827c12007-07-17 18:37:04 -07001498}
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07001499
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001500static __init void xen_pagetable_setup_start(pgd_t *base)
1501{
1502}
1503
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001504static void xen_post_allocator_init(void);
1505
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001506static __init void xen_pagetable_setup_done(pgd_t *base)
1507{
1508 xen_setup_shared_info();
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02001509 xen_post_allocator_init();
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001510}
1511
1512static void xen_write_cr2(unsigned long cr2)
1513{
1514 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1515}
1516
1517static unsigned long xen_read_cr2(void)
1518{
1519 return percpu_read(xen_vcpu)->arch.cr2;
1520}
1521
1522unsigned long xen_read_cr2_direct(void)
1523{
1524 return percpu_read(xen_vcpu_info.arch.cr2);
1525}
1526
1527static void xen_flush_tlb(void)
1528{
1529 struct mmuext_op *op;
1530 struct multicall_space mcs;
1531
1532 preempt_disable();
1533
1534 mcs = xen_mc_entry(sizeof(*op));
1535
1536 op = mcs.args;
1537 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1538 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1539
1540 xen_mc_issue(PARAVIRT_LAZY_MMU);
1541
1542 preempt_enable();
1543}
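
/*
 * Illustrative sketch (not called from anywhere in this file, and the
 * function name is made up): the multicall machinery used above can
 * batch several mmuext ops so that they reach the hypervisor in a
 * single trap.  Only xen_mc_batch()/__xen_mc_entry()/xen_mc_issue()
 * and the mmuext commands already used in this file are assumed.
 */
static void __maybe_unused example_flush_tlb_and_page(unsigned long addr)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();
	xen_mc_batch();

	/* First entry: flush the whole local TLB */
	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* Second entry: invalidate one extra address in the same batch */
	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_INVLPG_LOCAL;
	op->arg1.linear_addr = addr & PAGE_MASK;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* Both ops are submitted together (or deferred in lazy MMU mode) */
	xen_mc_issue(PARAVIRT_LAZY_MMU);
	preempt_enable();
}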
1544
1545static void xen_flush_tlb_single(unsigned long addr)
1546{
1547 struct mmuext_op *op;
1548 struct multicall_space mcs;
1549
1550 preempt_disable();
1551
1552 mcs = xen_mc_entry(sizeof(*op));
1553 op = mcs.args;
1554 op->cmd = MMUEXT_INVLPG_LOCAL;
1555 op->arg1.linear_addr = addr & PAGE_MASK;
1556 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1557
1558 xen_mc_issue(PARAVIRT_LAZY_MMU);
1559
1560 preempt_enable();
1561}
1562
1563static void xen_flush_tlb_others(const struct cpumask *cpus,
1564 struct mm_struct *mm, unsigned long va)
1565{
1566 struct {
1567 struct mmuext_op op;
1568 DECLARE_BITMAP(mask, NR_CPUS);
1569 } *args;
1570 struct multicall_space mcs;
1571
Jeremy Fitzhardingee3f8a742009-03-04 17:36:57 -08001572 if (cpumask_empty(cpus))
1573 return; /* nothing to do */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001574
1575 mcs = xen_mc_entry(sizeof(*args));
1576 args = mcs.args;
1577 args->op.arg2.vcpumask = to_cpumask(args->mask);
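	/*
	 * The bitmap lives inside the multicall argument space, so it
	 * remains valid until the batch (and the hypercall that reads
	 * the vcpumask) has actually been issued.
	 */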
1578
1579 /* Remove us, and any offline CPUS. */
1580 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1581 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001582
1583 if (va == TLB_FLUSH_ALL) {
1584 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1585 } else {
1586 args->op.cmd = MMUEXT_INVLPG_MULTI;
1587 args->op.arg1.linear_addr = va;
1588 }
1589
1590 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1591
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001592 xen_mc_issue(PARAVIRT_LAZY_MMU);
1593}
1594
1595static unsigned long xen_read_cr3(void)
1596{
1597 return percpu_read(xen_cr3);
1598}
1599
1600static void set_current_cr3(void *v)
1601{
1602 percpu_write(xen_current_cr3, (unsigned long)v);
1603}
1604
1605static void __xen_write_cr3(bool kernel, unsigned long cr3)
1606{
1607 struct mmuext_op *op;
1608 struct multicall_space mcs;
1609 unsigned long mfn;
1610
1611 if (cr3)
1612 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1613 else
1614 mfn = 0;
1615
1616 WARN_ON(mfn == 0 && kernel);
1617
1618 mcs = __xen_mc_entry(sizeof(*op));
1619
1620 op = mcs.args;
1621 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1622 op->arg1.mfn = mfn;
1623
1624 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1625
1626 if (kernel) {
1627 percpu_write(xen_cr3, cr3);
1628
1629 /* Update xen_current_cr3 once the batch has actually
1630 been submitted. */
1631 xen_mc_callback(set_current_cr3, (void *)cr3);
1632 }
1633}
1634
1635static void xen_write_cr3(unsigned long cr3)
1636{
1637 BUG_ON(preemptible());
1638
1639 xen_mc_batch(); /* disables interrupts */
1640
	/* Update while interrupts are disabled, so it's atomic with
	   respect to IPIs */
1643 percpu_write(xen_cr3, cr3);
1644
1645 __xen_write_cr3(true, cr3);
1646
1647#ifdef CONFIG_X86_64
1648 {
1649 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1650 if (user_pgd)
1651 __xen_write_cr3(false, __pa(user_pgd));
1652 else
1653 __xen_write_cr3(false, 0);
1654 }
1655#endif
1656
1657 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1658}
1659
1660static int xen_pgd_alloc(struct mm_struct *mm)
1661{
1662 pgd_t *pgd = mm->pgd;
1663 int ret = 0;
1664
1665 BUG_ON(PagePinned(virt_to_page(pgd)));
1666
1667#ifdef CONFIG_X86_64
1668 {
1669 struct page *page = virt_to_page(pgd);
1670 pgd_t *user_pgd;
1671
1672 BUG_ON(page->private != 0);
1673
1674 ret = -ENOMEM;
1675
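		/*
		 * The user pgd pointer is stashed in the kernel pgd's
		 * struct page ->private field; xen_get_user_pgd() looks
		 * it up from there later.
		 */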
1676 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1677 page->private = (unsigned long)user_pgd;
1678
1679 if (user_pgd != NULL) {
1680 user_pgd[pgd_index(VSYSCALL_START)] =
1681 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1682 ret = 0;
1683 }
1684
1685 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1686 }
1687#endif
1688
1689 return ret;
1690}
1691
1692static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1693{
1694#ifdef CONFIG_X86_64
1695 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1696
1697 if (user_pgd)
1698 free_page((unsigned long)user_pgd);
1699#endif
1700}
1701
Jeremy Fitzhardinge1f4f9312009-02-02 13:58:06 -08001702#ifdef CONFIG_X86_32
1703static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1704{
1705 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
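	/*
	 * The mask computed below is all-ones when the existing pte
	 * already has _PAGE_RW set, and ~_PAGE_RW when it does not, so
	 * ANDing it into the new value clears RW unless the old
	 * mapping was already writable.
	 */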
1706 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1707 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1708 pte_val_ma(pte));
1709
1710 return pte;
1711}
1712
1713/* Init-time set_pte while constructing initial pagetables, which
1714 doesn't allow RO pagetable pages to be remapped RW */
1715static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1716{
1717 pte = mask_rw_pte(ptep, pte);
1718
1719 xen_set_pte(ptep, pte);
1720}
1721#endif
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001722
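/*
 * Pin or unpin a single pagetable page with an immediate, unbatched
 * hypercall (contrast with xen_do_pin(), which goes through the
 * multicall machinery).
 */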
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001723static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1724{
1725 struct mmuext_op op;
1726 op.cmd = cmd;
1727 op.arg1.mfn = pfn_to_mfn(pfn);
1728 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1729 BUG();
1730}
1731
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001732/* Early in boot, while setting up the initial pagetable, assume
1733 everything is pinned. */
1734static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1735{
1736#ifdef CONFIG_FLATMEM
1737 BUG_ON(mem_map); /* should only be used early */
1738#endif
1739 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001740 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1741}
1742
1743/* Used for pmd and pud */
1744static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1745{
1746#ifdef CONFIG_FLATMEM
1747 BUG_ON(mem_map); /* should only be used early */
1748#endif
1749 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001750}
1751
1752/* Early release_pte assumes that all pts are pinned, since there's
1753 only init_mm and anything attached to that is pinned. */
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001754static __init void xen_release_pte_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001755{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001756 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001757 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1758}
1759
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001760static __init void xen_release_pmd_init(unsigned long pfn)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001761{
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07001762 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001763}
1764
/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
1767static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1768{
1769 struct page *page = pfn_to_page(pfn);
1770
1771 if (PagePinned(virt_to_page(mm->pgd))) {
1772 SetPagePinned(page);
1773
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001774 if (!PageHighMem(page)) {
1775 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1776 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1777 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1778 } else {
1779 /* make sure there are no stray mappings of
1780 this page */
1781 kmap_flush_unused();
1782 }
1783 }
1784}
1785
1786static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1787{
1788 xen_alloc_ptpage(mm, pfn, PT_PTE);
1789}
1790
1791static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1792{
1793 xen_alloc_ptpage(mm, pfn, PT_PMD);
1794}
1795
1796/* This should never happen until we're OK to use struct page */
1797static void xen_release_ptpage(unsigned long pfn, unsigned level)
1798{
1799 struct page *page = pfn_to_page(pfn);
1800
1801 if (PagePinned(page)) {
1802 if (!PageHighMem(page)) {
1803 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1804 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1805 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1806 }
1807 ClearPagePinned(page);
1808 }
1809}
1810
1811static void xen_release_pte(unsigned long pfn)
1812{
1813 xen_release_ptpage(pfn, PT_PTE);
1814}
1815
1816static void xen_release_pmd(unsigned long pfn)
1817{
1818 xen_release_ptpage(pfn, PT_PMD);
1819}
1820
1821#if PAGETABLE_LEVELS == 4
1822static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1823{
1824 xen_alloc_ptpage(mm, pfn, PT_PUD);
1825}
1826
1827static void xen_release_pud(unsigned long pfn)
1828{
1829 xen_release_ptpage(pfn, PT_PUD);
1830}
1831#endif
1832
1833void __init xen_reserve_top(void)
1834{
1835#ifdef CONFIG_X86_32
1836 unsigned long top = HYPERVISOR_VIRT_START;
1837 struct xen_platform_parameters pp;
1838
1839 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1840 top = pp.virt_start;
1841
1842 reserve_top_address(-top);
1843#endif /* CONFIG_X86_32 */
1844}
1845
/*
 * Like __va(), but returns the address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
1850static void *__ka(phys_addr_t paddr)
1851{
1852#ifdef CONFIG_X86_64
1853 return (void *)(paddr + __START_KERNEL_map);
1854#else
1855 return __va(paddr);
1856#endif
1857}
1858
1859/* Convert a machine address to physical address */
1860static unsigned long m2p(phys_addr_t maddr)
1861{
1862 phys_addr_t paddr;
1863
1864 maddr &= PTE_PFN_MASK;
1865 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1866
1867 return paddr;
1868}
1869
1870/* Convert a machine address to kernel virtual */
1871static void *m2v(phys_addr_t maddr)
1872{
1873 return __ka(m2p(maddr));
1874}
1875
1876static void set_page_prot(void *addr, pgprot_t prot)
1877{
1878 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1879 pte_t pte = pfn_pte(pfn, prot);
1880
1881 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1882 BUG();
1883}
1884
1885static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1886{
1887 unsigned pmdidx, pteidx;
1888 unsigned ident_pte;
1889 unsigned long pfn;
1890
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001891 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1892 PAGE_SIZE);
1893
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001894 ident_pte = 0;
1895 pfn = 0;
1896 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1897 pte_t *pte_page;
1898
1899 /* Reuse or allocate a page of ptes */
1900 if (pmd_present(pmd[pmdidx]))
1901 pte_page = m2v(pmd[pmdidx].pmd);
1902 else {
1903 /* Check for free pte pages */
Jeremy Fitzhardinge764f01382010-08-26 16:23:51 -07001904 if (ident_pte == LEVEL1_IDENT_ENTRIES)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08001905 break;
1906
1907 pte_page = &level1_ident_pgt[ident_pte];
1908 ident_pte += PTRS_PER_PTE;
1909
1910 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1911 }
1912
1913 /* Install mappings */
1914 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1915 pte_t pte;
1916
1917 if (pfn > max_pfn_mapped)
1918 max_pfn_mapped = pfn;
1919
1920 if (!pte_none(pte_page[pteidx]))
1921 continue;
1922
1923 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1924 pte_page[pteidx] = pte;
1925 }
1926 }
1927
1928 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1929 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1930
1931 set_page_prot(pmd, PAGE_KERNEL_RO);
1932}
1933
1934#ifdef CONFIG_X86_64
1935static void convert_pfn_mfn(void *v)
1936{
1937 pte_t *pte = v;
1938 int i;
1939
1940 /* All levels are converted the same way, so just treat them
1941 as ptes. */
1942 for (i = 0; i < PTRS_PER_PTE; i++)
1943 pte[i] = xen_make_pte(pte[i].pte);
1944}
1945
1946/*
 * Set up the initial kernel pagetable.
1948 *
1949 * We can construct this by grafting the Xen provided pagetable into
1950 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1951 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1952 * means that only the kernel has a physical mapping to start with -
1953 * but that's enough to get __va working. We need to fill in the rest
1954 * of the physical mapping once some sort of allocator has been set
1955 * up.
1956 */
1957__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1958 unsigned long max_pfn)
1959{
1960 pud_t *l3;
1961 pmd_t *l2;
1962
1963 /* Zap identity mapping */
1964 init_level4_pgt[0] = __pgd(0);
1965
1966 /* Pre-constructed entries are in pfn, so convert to mfn */
1967 convert_pfn_mfn(init_level4_pgt);
1968 convert_pfn_mfn(level3_ident_pgt);
1969 convert_pfn_mfn(level3_kernel_pgt);
1970
1971 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1972 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1973
1974 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1975 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1976
1977 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1978 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1979 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1980
1981 /* Set up identity map */
1982 xen_map_identity_early(level2_ident_pgt, max_pfn);
1983
1984 /* Make pagetable pieces RO */
1985 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1986 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1987 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1988 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1989 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1990 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1991
1992 /* Pin down new L4 */
1993 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1994 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1995
1996 /* Unpin Xen-provided one */
1997 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1998
1999 /* Switch over */
2000 pgd = init_level4_pgt;
2001
2002 /*
2003 * At this stage there can be no user pgd, and no page
2004 * structure to attach it to, so make sure we just set kernel
2005 * pgd.
2006 */
2007 xen_mc_batch();
2008 __xen_write_cr3(true, __pa(pgd));
2009 xen_mc_issue(PARAVIRT_LAZY_CPU);
2010
2011 reserve_early(__pa(xen_start_info->pt_base),
2012 __pa(xen_start_info->pt_base +
2013 xen_start_info->nr_pt_frames * PAGE_SIZE),
2014 "XEN PAGETABLES");
2015
2016 return pgd;
2017}
2018#else /* !CONFIG_X86_64 */
Jeremy Fitzhardingef0991802010-08-26 16:16:28 -07002019static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002020
2021__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
2022 unsigned long max_pfn)
2023{
2024 pmd_t *kernel_pmd;
2025
	level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2027
Jeremy Fitzhardinge93dbda72009-02-26 17:35:44 -08002028 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
2029 xen_start_info->nr_pt_frames * PAGE_SIZE +
2030 512*1024);
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002031
2032 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2033 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
2034
2035 xen_map_identity_early(level2_kernel_pgt, max_pfn);
2036
2037 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
2038 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
2039 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
2040
2041 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
2042 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2043 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2044
2045 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2046
2047 xen_write_cr3(__pa(swapper_pg_dir));
2048
2049 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
2050
Jeremy Fitzhardinge33df4db2009-05-07 11:56:44 -07002051 reserve_early(__pa(xen_start_info->pt_base),
2052 __pa(xen_start_info->pt_base +
2053 xen_start_info->nr_pt_frames * PAGE_SIZE),
2054 "XEN PAGETABLES");
2055
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002056 return swapper_pg_dir;
2057}
2058#endif /* CONFIG_X86_64 */
2059
Masami Hiramatsu3b3809a2009-04-09 10:55:33 -07002060static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002061{
2062 pte_t pte;
2063
2064 phys >>= PAGE_SHIFT;
2065
2066 switch (idx) {
2067 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2068#ifdef CONFIG_X86_F00F_BUG
2069 case FIX_F00F_IDT:
2070#endif
2071#ifdef CONFIG_X86_32
2072 case FIX_WP_TEST:
2073 case FIX_VDSO:
2074# ifdef CONFIG_HIGHMEM
2075 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2076# endif
2077#else
2078 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
2079#endif
2080#ifdef CONFIG_X86_LOCAL_APIC
2081 case FIX_APIC_BASE: /* maps dummy local APIC */
2082#endif
Jeremy Fitzhardinge3ecb1b72009-03-07 23:48:41 -08002083 case FIX_TEXT_POKE0:
2084 case FIX_TEXT_POKE1:
2085 /* All local page mappings */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002086 pte = pfn_pte(phys, prot);
2087 break;
2088
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08002089 case FIX_PARAVIRT_BOOTMAP:
2090 /* This is an MFN, but it isn't an IO mapping from the
2091 IO domain */
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002092 pte = mfn_pte(phys, prot);
2093 break;
Jeremy Fitzhardingec0011db2010-02-04 14:46:34 -08002094
2095 default:
2096 /* By default, set_fixmap is used for hardware mappings */
2097 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
2098 break;
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002099 }
2100
2101 __native_set_fixmap(idx, pte);
2102
2103#ifdef CONFIG_X86_64
2104 /* Replicate changes to map the vsyscall page into the user
2105 pagetable vsyscall mapping. */
2106 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
2107 unsigned long vaddr = __fix_to_virt(idx);
2108 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2109 }
2110#endif
2111}
2112
Thomas Gleixnerf1d70622009-08-20 13:13:52 +02002113static __init void xen_post_allocator_init(void)
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002114{
2115 pv_mmu_ops.set_pte = xen_set_pte;
2116 pv_mmu_ops.set_pmd = xen_set_pmd;
2117 pv_mmu_ops.set_pud = xen_set_pud;
2118#if PAGETABLE_LEVELS == 4
2119 pv_mmu_ops.set_pgd = xen_set_pgd;
2120#endif
2121
2122 /* This will work as long as patching hasn't happened yet
2123 (which it hasn't) */
2124 pv_mmu_ops.alloc_pte = xen_alloc_pte;
2125 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2126 pv_mmu_ops.release_pte = xen_release_pte;
2127 pv_mmu_ops.release_pmd = xen_release_pmd;
2128#if PAGETABLE_LEVELS == 4
2129 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2130 pv_mmu_ops.release_pud = xen_release_pud;
2131#endif
2132
2133#ifdef CONFIG_X86_64
2134 SetPagePinned(virt_to_page(level3_user_vsyscall));
2135#endif
2136 xen_mark_init_mm_pinned();
2137}
2138
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002139static void xen_leave_lazy_mmu(void)
2140{
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002141 preempt_disable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002142 xen_mc_flush();
2143 paravirt_leave_lazy_mmu();
Jeremy Fitzhardinge5caecb92009-02-20 23:01:26 -08002144 preempt_enable();
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002145}
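
/*
 * Illustrative sketch (hypothetical helper, not part of this file):
 * generic pagetable code brackets batched updates with
 * arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode().  Under Xen
 * those hooks resolve to paravirt_enter_lazy_mmu and
 * xen_leave_lazy_mmu above, so each set_pte_at() below is queued as a
 * multicall and only the final xen_mc_flush() pushes the whole batch
 * to the hypervisor.
 */
static void __maybe_unused example_set_pte_range(struct mm_struct *mm,
						 unsigned long addr,
						 pte_t *ptep,
						 unsigned long pfn, int nr)
{
	int i;

	arch_enter_lazy_mmu_mode();
	for (i = 0; i < nr; i++, addr += PAGE_SIZE, ptep++, pfn++)
		set_pte_at(mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL));
	arch_leave_lazy_mmu_mode();	/* -> xen_leave_lazy_mmu -> xen_mc_flush() */
}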
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002146
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002147static const struct pv_mmu_ops xen_mmu_ops __initdata = {
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002148 .read_cr2 = xen_read_cr2,
2149 .write_cr2 = xen_write_cr2,
2150
2151 .read_cr3 = xen_read_cr3,
2152 .write_cr3 = xen_write_cr3,
2153
2154 .flush_tlb_user = xen_flush_tlb,
2155 .flush_tlb_kernel = xen_flush_tlb,
2156 .flush_tlb_single = xen_flush_tlb_single,
2157 .flush_tlb_others = xen_flush_tlb_others,
2158
2159 .pte_update = paravirt_nop,
2160 .pte_update_defer = paravirt_nop,
2161
2162 .pgd_alloc = xen_pgd_alloc,
2163 .pgd_free = xen_pgd_free,
2164
2165 .alloc_pte = xen_alloc_pte_init,
2166 .release_pte = xen_release_pte_init,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002167 .alloc_pmd = xen_alloc_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002168 .alloc_pmd_clone = paravirt_nop,
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002169 .release_pmd = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002170
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002171#ifdef CONFIG_X86_64
2172 .set_pte = xen_set_pte,
2173#else
2174 .set_pte = xen_set_pte_init,
2175#endif
2176 .set_pte_at = xen_set_pte_at,
2177 .set_pmd = xen_set_pmd_hyper,
2178
2179 .ptep_modify_prot_start = __ptep_modify_prot_start,
2180 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2181
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002182 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2183 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002184
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002185 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2186 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002187
2188#ifdef CONFIG_X86_PAE
2189 .set_pte_atomic = xen_set_pte_atomic,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002190 .pte_clear = xen_pte_clear,
2191 .pmd_clear = xen_pmd_clear,
2192#endif /* CONFIG_X86_PAE */
2193 .set_pud = xen_set_pud_hyper,
2194
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002195 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2196 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002197
2198#if PAGETABLE_LEVELS == 4
Jeremy Fitzhardingeda5de7c2009-01-28 14:35:07 -08002199 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2200 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002201 .set_pgd = xen_set_pgd_hyper,
2202
Jeremy Fitzhardingeb96229b2009-03-17 13:30:55 -07002203 .alloc_pud = xen_alloc_pmd_init,
2204 .release_pud = xen_release_pmd_init,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002205#endif /* PAGETABLE_LEVELS == 4 */
2206
2207 .activate_mm = xen_activate_mm,
2208 .dup_mmap = xen_dup_mmap,
2209 .exit_mmap = xen_exit_mmap,
2210
2211 .lazy_mode = {
2212 .enter = paravirt_enter_lazy_mmu,
Jeremy Fitzhardingeb407fc52009-02-17 23:46:21 -08002213 .leave = xen_leave_lazy_mmu,
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002214 },
2215
2216 .set_fixmap = xen_set_fixmap,
2217};
2218
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002219void __init xen_init_mmu_ops(void)
2220{
2221 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2222 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2223 pv_mmu_ops = xen_mmu_ops;
Jeremy Fitzhardinged2cb2142010-03-26 15:37:50 -07002224
2225 vmap_lazy_unmap = false;
Thomas Gleixner030cb6c2009-08-20 14:30:02 +02002226}
Jeremy Fitzhardinge319f3ba2009-01-28 14:35:01 -08002227
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002228/* Protected by xen_reservation_lock. */
2229#define MAX_CONTIG_ORDER 9 /* 2MB */
2230static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2231
2232#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2233static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2234 unsigned long *in_frames,
2235 unsigned long *out_frames)
2236{
2237 int i;
2238 struct multicall_space mcs;
2239
2240 xen_mc_batch();
2241 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2242 mcs = __xen_mc_entry(0);
2243
2244 if (in_frames)
2245 in_frames[i] = virt_to_mfn(vaddr);
2246
2247 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2248 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2249
2250 if (out_frames)
2251 out_frames[i] = virt_to_pfn(vaddr);
2252 }
2253 xen_mc_issue(0);
2254}
2255
2256/*
2257 * Update the pfn-to-mfn mappings for a virtual address range, either to
2258 * point to an array of mfns, or contiguously from a single starting
2259 * mfn.
2260 */
2261static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2262 unsigned long *mfns,
2263 unsigned long first_mfn)
2264{
2265 unsigned i, limit;
2266 unsigned long mfn;
2267
2268 xen_mc_batch();
2269
2270 limit = 1u << order;
2271 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2272 struct multicall_space mcs;
2273 unsigned flags;
2274
2275 mcs = __xen_mc_entry(0);
2276 if (mfns)
2277 mfn = mfns[i];
2278 else
2279 mfn = first_mfn + i;
2280
2281 if (i < (limit - 1))
2282 flags = 0;
2283 else {
2284 if (order == 0)
2285 flags = UVMF_INVLPG | UVMF_ALL;
2286 else
2287 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2288 }
2289
2290 MULTI_update_va_mapping(mcs.mc, vaddr,
2291 mfn_pte(mfn, PAGE_KERNEL), flags);
2292
2293 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2294 }
2295
2296 xen_mc_issue(0);
2297}
2298
2299/*
2300 * Perform the hypercall to exchange a region of our pfns to point to
2301 * memory with the required contiguous alignment. Takes the pfns as
2302 * input, and populates mfns as output.
2303 *
2304 * Returns a success code indicating whether the hypervisor was able to
2305 * satisfy the request or not.
2306 */
2307static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2308 unsigned long *pfns_in,
2309 unsigned long extents_out,
2310 unsigned int order_out,
2311 unsigned long *mfns_out,
2312 unsigned int address_bits)
2313{
2314 long rc;
2315 int success;
2316
2317 struct xen_memory_exchange exchange = {
2318 .in = {
2319 .nr_extents = extents_in,
2320 .extent_order = order_in,
2321 .extent_start = pfns_in,
2322 .domid = DOMID_SELF
2323 },
2324 .out = {
2325 .nr_extents = extents_out,
2326 .extent_order = order_out,
2327 .extent_start = mfns_out,
2328 .address_bits = address_bits,
2329 .domid = DOMID_SELF
2330 }
2331 };
2332
2333 BUG_ON(extents_in << order_in != extents_out << order_out);
2334
2335 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2336 success = (exchange.nr_exchanged == extents_in);
2337
2338 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2339 BUG_ON(success && (rc != 0));
2340
2341 return success;
2342}
2343
2344int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2345 unsigned int address_bits)
2346{
2347 unsigned long *in_frames = discontig_frames, out_frame;
2348 unsigned long flags;
2349 int success;
2350
2351 /*
2352 * Currently an auto-translated guest will not perform I/O, nor will
2353 * it require PAE page directories below 4GB. Therefore any calls to
2354 * this function are redundant and can be ignored.
2355 */
2356
2357 if (xen_feature(XENFEAT_auto_translated_physmap))
2358 return 0;
2359
2360 if (unlikely(order > MAX_CONTIG_ORDER))
2361 return -ENOMEM;
2362
2363 memset((void *) vstart, 0, PAGE_SIZE << order);
2364
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002365 spin_lock_irqsave(&xen_reservation_lock, flags);
2366
2367 /* 1. Zap current PTEs, remembering MFNs. */
2368 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2369
2370 /* 2. Get a new contiguous memory extent. */
2371 out_frame = virt_to_pfn(vstart);
2372 success = xen_exchange_memory(1UL << order, 0, in_frames,
2373 1, order, &out_frame,
2374 address_bits);
2375
2376 /* 3. Map the new extent in place of old pages. */
2377 if (success)
2378 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2379 else
2380 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2381
2382 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2383
2384 return success ? 0 : -ENOMEM;
2385}
2386EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2387
2388void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2389{
2390 unsigned long *out_frames = discontig_frames, in_frame;
2391 unsigned long flags;
2392 int success;
2393
2394 if (xen_feature(XENFEAT_auto_translated_physmap))
2395 return;
2396
2397 if (unlikely(order > MAX_CONTIG_ORDER))
2398 return;
2399
2400 memset((void *) vstart, 0, PAGE_SIZE << order);
2401
Alex Nixon08bbc9d2009-02-09 12:05:46 -08002402 spin_lock_irqsave(&xen_reservation_lock, flags);
2403
2404 /* 1. Find start MFN of contiguous extent. */
2405 in_frame = virt_to_mfn(vstart);
2406
2407 /* 2. Zap current PTEs. */
2408 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2409
2410 /* 3. Do the exchange for non-contiguous MFNs. */
2411 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2412 0, out_frames, 0);
2413
2414 /* 4. Map new pages in place of old pages. */
2415 if (success)
2416 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2417 else
2418 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2419
2420 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2421}
2422EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
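
/*
 * Illustrative sketch of how a user of the two exports above (e.g. a
 * DMA allocator such as the Xen swiotlb code) might obtain and release
 * a machine-contiguous buffer.  Both helper names below are made up
 * for the example.
 */
static void * __maybe_unused example_alloc_machine_contiguous(unsigned int order)
{
	unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

	if (!vstart)
		return NULL;

	/* Ask Xen for machine-contiguous frames below 4GB (32 address bits) */
	if (xen_create_contiguous_region(vstart, order, 32)) {
		free_pages(vstart, order);
		return NULL;
	}

	return (void *)vstart;
}

static void __maybe_unused example_free_machine_contiguous(void *vaddr,
							    unsigned int order)
{
	/* Give the contiguous machine frames back before freeing the pages */
	xen_destroy_contiguous_region((unsigned long)vaddr, order);
	free_pages((unsigned long)vaddr, order);
}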
2423
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002424#ifdef CONFIG_XEN_PVHVM
Stefano Stabellini59151002010-06-17 14:22:52 +01002425static void xen_hvm_exit_mmap(struct mm_struct *mm)
2426{
2427 struct xen_hvm_pagetable_dying a;
2428 int rc;
2429
2430 a.domid = DOMID_SELF;
2431 a.gpa = __pa(mm->pgd);
2432 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2433 WARN_ON_ONCE(rc < 0);
2434}
2435
2436static int is_pagetable_dying_supported(void)
2437{
2438 struct xen_hvm_pagetable_dying a;
2439 int rc = 0;
2440
2441 a.domid = DOMID_SELF;
2442 a.gpa = 0x00;
2443 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2444 if (rc < 0) {
2445 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2446 return 0;
2447 }
2448 return 1;
2449}
2450
2451void __init xen_hvm_init_mmu_ops(void)
2452{
2453 if (is_pagetable_dying_supported())
2454 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2455}
Stefano Stabellinica65f9f2010-07-29 14:37:48 +01002456#endif
Stefano Stabellini59151002010-06-17 14:22:52 +01002457
Jeremy Fitzhardinge994025c2008-08-20 17:02:19 -07002458#ifdef CONFIG_XEN_DEBUG_FS
2459
2460static struct dentry *d_mmu_debug;
2461
2462static int __init xen_mmu_debugfs(void)
2463{
2464 struct dentry *d_xen = xen_init_debugfs();
2465
2466 if (d_xen == NULL)
2467 return -ENOMEM;
2468
2469 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2470
2471 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2472
2473 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2474 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2475 &mmu_stats.pgd_update_pinned);
2476 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2477 &mmu_stats.pgd_update_pinned);
2478
2479 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2480 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2481 &mmu_stats.pud_update_pinned);
2482 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2483 &mmu_stats.pud_update_pinned);
2484
2485 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2486 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2487 &mmu_stats.pmd_update_pinned);
2488 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2489 &mmu_stats.pmd_update_pinned);
2490
2491 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2492// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2493// &mmu_stats.pte_update_pinned);
2494 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2495 &mmu_stats.pte_update_pinned);
2496
2497 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2498 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2499 &mmu_stats.mmu_update_extended);
2500 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2501 mmu_stats.mmu_update_histo, 20);
2502
2503 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2504 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2505 &mmu_stats.set_pte_at_batched);
2506 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2507 &mmu_stats.set_pte_at_current);
2508 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2509 &mmu_stats.set_pte_at_kernel);
2510
2511 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2512 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2513 &mmu_stats.prot_commit_batched);
2514
2515 return 0;
2516}
2517fs_initcall(xen_mmu_debugfs);
2518
2519#endif /* CONFIG_XEN_DEBUG_FS */