/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

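/*
 * A per-cpu batch of page tables queued for freeing after an RCU
 * grace period. table[] is filled from both ends: page table
 * fragments from the front (pgt_index counts up), full crst tables
 * from the back (crst_index counts down). The batch is full when
 * the two indices meet.
 */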
struct rcu_table_freelist {
        struct rcu_head rcu;
        struct mm_struct *mm;
        unsigned int pgt_index;
        unsigned int crst_index;
        unsigned long *table[0];
};

#define RCU_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
          / sizeof(unsigned long))

static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);

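/*
 * Return the current CPU's batch, allocating a fresh page for it if
 * necessary. GFP_ATOMIC is used because the callers run with
 * preemption disabled. May return NULL; callers fall back to
 * synchronous freeing in that case.
 */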
static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
        struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
        struct rcu_table_freelist *batch = *batchp;

        if (batch)
                return batch;
        batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
        if (batch) {
                batch->mm = mm;
                batch->pgt_index = 0;
                batch->crst_index = RCU_FREELIST_SIZE;
                *batchp = batch;
        }
        return batch;
}

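/*
 * RCU callback: once this runs, no CPU can still hold a stale
 * reference to any of the batched tables, so hand them back to
 * the allocator.
 */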
static void rcu_table_freelist_callback(struct rcu_head *head)
{
        struct rcu_table_freelist *batch =
                container_of(head, struct rcu_table_freelist, rcu);

        while (batch->pgt_index > 0)
                __page_table_free(batch->mm, batch->table[--batch->pgt_index]);
        while (batch->crst_index < RCU_FREELIST_SIZE)
                crst_table_free(batch->mm, batch->table[batch->crst_index++]);
        free_page((unsigned long) batch);
}

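/*
 * Detach this CPU's batch and schedule it for freeing after the
 * next RCU grace period.
 */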
void rcu_table_freelist_finish(void)
{
        struct rcu_table_freelist **batchp = &get_cpu_var(rcu_table_freelist);
        struct rcu_table_freelist *batch = *batchp;

        if (!batch)
                goto out;
        call_rcu(&batch->rcu, rcu_table_freelist_callback);
        *batchp = NULL;
out:
        put_cpu_var(rcu_table_freelist);
}

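/*
 * Deliberately empty. Invoking it synchronously on every CPU via
 * smp_call_function() guarantees that all CPUs have left any code
 * path that might still dereference a table about to be freed,
 * a cheap substitute for a grace period when no batch page could
 * be allocated.
 */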
static void smp_sync(void *arg)
{
}

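/*
 * Table geometry: on 31 bit a page table has 256 four-byte entries
 * (1K), so four fit into one 4K page; on 64 bit the entries are
 * eight bytes (2K per table), so two fit. FRAG_MASK covers one
 * allocation bit in page->flags per fragment. For mms with extended
 * pgstes the half following each page table holds the pgste array,
 * which clear_table_pgstes() initializes to zero.
 */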
#ifndef CONFIG_64BIT
#define ALLOC_ORDER     1
#define TABLES_PER_PAGE 4
#define FRAG_MASK       15UL
#define SECOND_HALVES   10UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 256, 0, PAGE_SIZE/4);
        clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER     2
#define TABLES_PER_PAGE 2
#define FRAG_MASK       3UL
#define SECOND_HALVES   2UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

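/*
 * Parse the "vmalloc=<size>" early parameter. The requested size is
 * carved out below VMALLOC_END, moving VMALLOC_START down to a page
 * aligned address.
 */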
static int __init parse_vmalloc(char *arg)
{
        if (!arg)
                return -EINVAL;
        VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
        return 0;
}
early_param("vmalloc", parse_vmalloc);

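/*
 * Allocate a full region/segment (crst) table: four pages on 64 bit,
 * two on 31 bit (see ALLOC_ORDER). The physical address from
 * page_to_phys() can be used as a pointer here because the kernel
 * maps memory 1:1.
 */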
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        free_pages((unsigned long) table, ALLOC_ORDER);
}

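/*
 * Free a crst table once no other CPU can reference it anymore.
 * If the mm is private to this CPU (at most one user, cpumask equal
 * to just the current CPU) the table is freed immediately. Otherwise
 * it is queued on the per-cpu RCU freelist; should no freelist page
 * be available, an empty smp_call_function() synchronizes with all
 * other CPUs before the direct free.
 */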
void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
        struct rcu_table_freelist *batch;

        preempt_disable();
        if (atomic_read(&mm->mm_users) < 2 &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
                crst_table_free(mm, table);
                goto out;
        }
        batch = rcu_table_freelist_get(mm);
        if (!batch) {
                smp_call_function(smp_sync, NULL, 1);
                crst_table_free(mm, table);
                goto out;
        }
        batch->table[--batch->crst_index] = table;
        if (batch->pgt_index >= batch->crst_index)
                rcu_table_freelist_finish();
out:
        preempt_enable();
}

#ifdef CONFIG_64BIT
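/*
 * Grow the usable address space of a 64 bit mm by stacking region
 * tables on top of the existing tree: 2 GB (segment table only)
 * -> 4 TB (region third) -> 8 PB (region second). Usage sketch of a
 * hypothetical caller (the real call sites live elsewhere, e.g. in
 * the mmap layer):
 *
 *      if (addr + len > mm->context.asce_limit)
 *              rc = crst_table_upgrade(mm, addr + len);
 */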
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm);
        if (!table)
                return -ENOMEM;
        spin_lock_bh(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock_bh(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        update_mm(mm, current);
        return 0;
}

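/*
 * Shrink the address space back to the given limit by peeling off
 * the topmost region table level(s). The mm is TLB-flushed first;
 * the final update_mm() makes the hardware use the new top-level
 * table if the mm belongs to the current task.
 */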
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        if (mm->context.asce_limit <= limit)
                return;
        __tlb_flush_mm(mm);
        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
        update_mm(mm, current);
}
#endif

/*
 * Page table entry allocation/free routines.
 */
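/*
 * Allocate one page table with 256 entries. Each 4K page is split
 * into TABLES_PER_PAGE fragments and page->flags tracks which
 * fragments are in use; a table for an mm with extended pgstes
 * claims two adjacent fragments (bit pattern 3 instead of 1), the
 * second one holding the pgste array. Pages with free fragments sit
 * at the front of mm->context.pgtable_list, full pages at the tail.
 * Usage sketch (the actual callers live in the pgalloc machinery,
 * not in this file):
 *
 *      unsigned long *table = page_table_alloc(mm);
 *      if (table)
 *              page_table_free(mm, table);
 */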
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        struct page *page;
        unsigned long *table;
        unsigned long bits;

        bits = (mm->context.has_pgste) ? 3UL : 1UL;
        spin_lock_bh(&mm->context.list_lock);
        page = NULL;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                        page = NULL;
        }
        if (!page) {
                spin_unlock_bh(&mm->context.list_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                page->flags &= ~FRAG_MASK;
                table = (unsigned long *) page_to_phys(page);
                if (mm->context.has_pgste)
                        clear_table_pgstes(table);
                else
                        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock_bh(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        }
        table = (unsigned long *) page_to_phys(page);
        while (page->flags & bits) {
                table += 256;
                bits <<= 1;
        }
        page->flags |= bits;
        if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                list_move_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        return table;
}

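/*
 * Free a page table fragment whose allocation bits were encoded into
 * the low bits of the table pointer by page_table_free_rcu(). Runs
 * from the RCU callback, so no list handling is needed here; the page
 * was already unlinked from the pgtable_list.
 */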
static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned long bits;

        bits = ((unsigned long) table) & 15;
        table = (unsigned long *)(((unsigned long) table) ^ bits);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        page->flags ^= bits;
        if (!(page->flags & FRAG_MASK)) {
                pgtable_page_dtor(page);
                __free_page(page);
        }
}

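/*
 * Free a page table fragment directly. Clears the fragment's bits in
 * page->flags, requeues the page at the front of the pgtable_list if
 * it is still partially used, and releases the whole 4K page once
 * every fragment is free.
 */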
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned long bits;

        bits = (mm->context.has_pgste) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        page->flags ^= bits;
        if (page->flags & FRAG_MASK) {
                /* Page now has some free pgtable fragments. */
                if (!list_empty(&page->lru))
                        list_move(&page->lru, &mm->context.pgtable_list);
                page = NULL;
        } else
                /* All fragments of the 4K page have been freed. */
                list_del(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        if (page) {
                pgtable_page_dtor(page);
                __free_page(page);
        }
}

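/*
 * RCU variant of page_table_free() with the same fast path and
 * fallback as crst_table_free_rcu(). The allocation bits are stashed
 * in the low bits of the table pointer for __page_table_free() to
 * recover, and the page is unlinked from the pgtable_list up front
 * so the fragment cannot be handed out again before the grace period
 * has passed.
 */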
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
        struct rcu_table_freelist *batch;
        struct page *page;
        unsigned long bits;

        preempt_disable();
        if (atomic_read(&mm->mm_users) < 2 &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
                page_table_free(mm, table);
                goto out;
        }
        batch = rcu_table_freelist_get(mm);
        if (!batch) {
                smp_call_function(smp_sync, NULL, 1);
                page_table_free(mm, table);
                goto out;
        }
        bits = (mm->context.has_pgste) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        /* Delayed freeing with rcu prevents reuse of pgtable fragments */
        list_del_init(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        table = (unsigned long *)(((unsigned long) table) | bits);
        batch->table[batch->pgt_index++] = table;
        if (batch->pgt_index >= batch->crst_index)
                rcu_table_freelist_finish();
out:
        preempt_enable();
}

/*
 * Switch on pgstes for the userspace process of the current task
 * (needed for KVM).
 */
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Do we have switched amode? If no, we cannot do sie */
        if (user_mode == HOME_SPACE_MODE)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done */
        if (tsk->mm->context.has_pgste)
                return 0;

        /* Let's check if we are allowed to replace the mm */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* We copy the mm and let dup_mm create the page tables with pgstes */
        tsk->mm->context.alloc_pgste = 1;
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

        /* Now let's check again if something happened */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* OK, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        atomic_inc(&mm->context.attach_count);
        atomic_dec(&old_mm->context.attach_count);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

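/*
 * Report whether a kernel page is currently mapped. The lra (load
 * real address) instruction sets a nonzero condition code when
 * address translation fails, which happens once DEBUG_PAGEALLOC has
 * unmapped the page. Hibernation uses this to skip such pages.
 */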
#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
        unsigned long addr;
        int cc;

        addr = page_to_phys(page);
        asm volatile(
                "       lra     %1,0(%1)\n"
                "       ipm     %0\n"
                "       srl     %0,28"
                : "=d" (cc), "+a" (addr) : : "cc");
        return cc == 0;
}
#endif /* CONFIG_DEBUG_PAGEALLOC && CONFIG_HIBERNATION */