/*
 *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
 *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
 *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
 */


#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>

#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
#else
#define GET_CR2_INTO_RCX movq %cr2, %rcx
#endif

/* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
 * because we need identity-mapped pages.
 */

	.text
	.section .text.head
	.code64
	.globl startup_64
startup_64:

	/*
	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
	 * and someone has loaded an identity mapped page table
	 * for us.  These identity mapped page tables map all of the
	 * kernel pages and possibly all of memory.
	 *
	 * %esi holds a physical pointer to real_mode_data.
	 *
	 * We come here either directly from a 64bit bootloader, or from
	 * arch/x86_64/boot/compressed/head.S.
	 *
	 * We only come here initially at boot, nothing else comes here.
	 *
	 * Since we may be loaded at an address different from what we were
	 * compiled to run at, we first fixup the physical addresses in our page
	 * tables and then reload them.
	 */

	/* Compute the delta between the address I am compiled to run at and the
	 * address I am actually running at.
	 */
	leaq	_text(%rip), %rbp
	subq	$_text - __START_KERNEL_map, %rbp
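	/*
	 * Worked example (illustrative numbers, not taken from this file):
	 * if the kernel was compiled to run at __START_KERNEL_map + 0x200000
	 * (i.e. physical 0x200000) but was loaded at physical 0x1000000,
	 * then %rbp = 0x1000000 - 0x200000 = 0xe00000, and that delta is
	 * what gets added to every physical address in the initial page
	 * tables below.
	 */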

	/* Is the address not 2M aligned? */
	movq	%rbp, %rax
	andl	$~PMD_PAGE_MASK, %eax
	testl	%eax, %eax
	jnz	bad_address

	/* Is the address too large? */
	leaq	_text(%rip), %rdx
	movq	$PGDIR_SIZE, %rax
	cmpq	%rax, %rdx
	jae	bad_address

	/* Fixup the physical addresses in the page table */
	addq	%rbp, init_level4_pgt + 0(%rip)
	addq	%rbp, init_level4_pgt + (258*8)(%rip)
	addq	%rbp, init_level4_pgt + (511*8)(%rip)

	addq	%rbp, level3_ident_pgt + 0(%rip)

	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)

	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
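	/*
	 * For reference (pgd indices, assuming the layout of this kernel
	 * generation): entry 0 covers the low identity mapping, entry 258
	 * covers the direct mapping at __PAGE_OFFSET
	 * ((0xffff810000000000 >> 39) & 511 == 258), and entry 511 covers
	 * the kernel text mapping at __START_KERNEL_map
	 * ((0xffffffff80000000 >> 39) & 511 == 511).
	 */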

	/* Add an Identity mapping if I am above 1G */
	leaq	_text(%rip), %rdi
	andq	$PMD_PAGE_MASK, %rdi

	movq	%rdi, %rax
	shrq	$PUD_SHIFT, %rax
	andq	$(PTRS_PER_PUD - 1), %rax
	jz	ident_complete

	leaq	(level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
	leaq	level3_ident_pgt(%rip), %rbx
	movq	%rdx, 0(%rbx, %rax, 8)

	movq	%rdi, %rax
	shrq	$PMD_SHIFT, %rax
	andq	$(PTRS_PER_PMD - 1), %rax
	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
	leaq	level2_spare_pgt(%rip), %rbx
	movq	%rdx, 0(%rbx, %rax, 8)
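	/*
	 * Worked example (hypothetical load address): with _text at physical
	 * 0x80000000 (2G), the pud index is (0x80000000 >> PUD_SHIFT) & 511
	 * == 2, so level3_ident_pgt[2] is pointed at level2_spare_pgt, and
	 * the pmd index is (0x80000000 >> PMD_SHIFT) & 511 == 0, so
	 * level2_spare_pgt[0] gets a 2M large-page entry for 0x80000000.
	 */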
ident_complete:

	/*
	 * Fixup the kernel text+data virtual addresses. Note that when the
	 * kernel is relocated we might write invalid pmds; cleanup_highmap()
	 * fixes this up along with the mappings beyond _end.
	 */

	leaq	level2_kernel_pgt(%rip), %rdi
	leaq	4096(%rdi), %r8
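	/* 4096 bytes / 8 bytes per pmd entry = 512 entries to scan below */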
	/* See if it is a valid page table entry */
1:	testq	$1, 0(%rdi)
	jz	2f
	addq	%rbp, 0(%rdi)
	/* Go to the next page */
2:	addq	$8, %rdi
	cmp	%r8, %rdi
	jne	1b

	/* Fixup phys_base */
	addq	%rbp, phys_base(%rip)

#ifdef CONFIG_SMP
	addq	%rbp, trampoline_level4_pgt + 0(%rip)
	addq	%rbp, trampoline_level4_pgt + (511*8)(%rip)
#endif

	/* Due to ENTRY(), sometimes the empty space gets filled with
	 * zeros. Better to take a jmp than to rely on the empty space
	 * being filled with 0x90 (nop).
	 */
	jmp secondary_startup_64
ENTRY(secondary_startup_64)
	/*
	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
	 * and someone has loaded a mapped page table.
	 *
	 * %esi holds a physical pointer to real_mode_data.
	 *
	 * We come here either from startup_64 (using physical addresses)
	 * or from trampoline.S (using virtual addresses).
	 *
	 * Using virtual addresses from trampoline.S removes the need
	 * to have any identity mapped pages in the kernel page table
	 * after the boot processor executes this code.
	 */

	/* Enable PAE mode and PGE */
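	/* (bit 5 of %cr4 is PAE and bit 7 is PGE, hence the btsq operands below) */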
	xorq	%rax, %rax
	btsq	$5, %rax
	btsq	$7, %rax
	movq	%rax, %cr4

	/* Set up the early boot 4-level pagetables. */
	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
	addq	phys_base(%rip), %rax
	movq	%rax, %cr3

	/* Ensure I am executing from virtual addresses */
	movq	$1f, %rax
	jmp	*%rax
1:

	/* Check if nx is implemented */
	movl	$0x80000001, %eax
	cpuid
	movl	%edx,%edi

	/* Set up EFER (Extended Feature Enable Register) */
	movl	$MSR_EFER, %ecx
	rdmsr
	btsl	$_EFER_SCE, %eax	/* Enable System Call */
	btl	$20,%edi		/* No Execute supported? */
	jnc     1f
	btsl	$_EFER_NX, %eax
1:	wrmsr				/* Make changes effective */

	/* Set up cr0 */
#define CR0_PM				1		/* protected mode */
#define CR0_MP				(1<<1)
#define CR0_ET				(1<<4)
#define CR0_NE				(1<<5)
#define CR0_WP				(1<<16)
#define CR0_AM				(1<<18)
#define CR0_PAGING			(1<<31)
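	/* For reference, the OR of the flags below evaluates to 0x80050033. */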
	movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
	/* Make changes effective */
	movq	%rax, %cr0

	/* Set up a boot time stack */
	movq init_rsp(%rip),%rsp

	/* zero EFLAGS after setting rsp */
	pushq $0
	popfq

	/*
	 * We must switch to a new descriptor in kernel space for the GDT
	 * because soon the kernel won't have access anymore to the userspace
	 * addresses where we're currently running. We have to do that here
	 * because in 32bit we couldn't load a 64bit linear address.
	 */
	lgdt	cpu_gdt_descr(%rip)

	/* Set up data segments; actually 0 would do too */
	movl $__KERNEL_DS,%eax
	movl %eax,%ds
	movl %eax,%ss
	movl %eax,%es

	/*
	 * We don't really need to load %fs or %gs, but load them anyway
	 * to kill any stale realmode selectors.  This allows execution
	 * under VT hardware.
	 */
	movl %eax,%fs
	movl %eax,%gs

	/*
	 * Set up a dummy PDA. This is just for some early bootup code
	 * that does in_interrupt().
	 */
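	/*
	 * wrmsr writes %edx:%eax (high:low) to the MSR selected by %ecx,
	 * which is why the 64-bit base is split with the shrq below.
	 */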
	movl	$MSR_GS_BASE,%ecx
	movq	$empty_zero_page,%rax
	movq    %rax,%rdx
	shrq	$32,%rdx
	wrmsr

	/* %esi is a pointer to the real mode structure with interesting info;
	   pass it to C */
	movl	%esi, %edi

	/* Finally jump to run C code and to be on a real kernel address.
	 * Since we are running on identity-mapped space we have to jump
	 * to the full 64bit address; this is only possible with an indirect
	 * jump.  In addition we need to ensure %cs is set, so we make this
	 * a far return.
	 */
	movq	initial_code(%rip),%rax
	pushq	$0		# fake return address to stop unwinder
	pushq	$__KERNEL_CS	# set correct cs
	pushq	%rax		# target address in negative space
	lretq

	/* SMP bootup changes these two */
	__REFDATA
	.align	8
	ENTRY(initial_code)
	.quad	x86_64_start_kernel
	__FINITDATA

	ENTRY(init_rsp)
	.quad  init_thread_union+THREAD_SIZE-8

bad_address:
	jmp bad_address

	.section ".init.text","ax"
#ifdef CONFIG_EARLY_PRINTK
	.globl early_idt_handlers
early_idt_handlers:
	i = 0
	.rept NUM_EXCEPTION_VECTORS
	movl $i, %esi
	jmp early_idt_handler
	i = i + 1
	.endr
#endif

ENTRY(early_idt_handler)
#ifdef CONFIG_EARLY_PRINTK
	cmpl $2,early_recursion_flag(%rip)
	jz  1f
	incl early_recursion_flag(%rip)
	GET_CR2_INTO_RCX
	movq %rcx,%r9
	xorl %r8d,%r8d		# zero for error code
	movl %esi,%ecx		# get vector number
	# Test %ecx against mask of vectors that push error code.
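	# (0x27d00 sets bits 8, 10-14 and 17: #DF, #TS, #NP, #SS, #GP, #PF
	#  and #AC are the exceptions that push an error code.)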
	cmpl $31,%ecx
	ja 0f
	movl $1,%eax
	salq %cl,%rax
	testl $0x27d00,%eax
	je 0f
	popq %r8		# get error code
0:	movq 0(%rsp),%rcx	# get ip
	movq 8(%rsp),%rdx	# get cs
	xorl %eax,%eax
	leaq early_idt_msg(%rip),%rdi
	call early_printk
	cmpl $2,early_recursion_flag(%rip)
	jz  1f
	call dump_stack
#ifdef CONFIG_KALLSYMS
	leaq early_idt_ripmsg(%rip),%rdi
	movq 0(%rsp),%rsi	# get rip again (saved rip sits at the top of the stack here)
	call __print_symbol
#endif
#endif /* EARLY_PRINTK */
1:	hlt
	jmp 1b

#ifdef CONFIG_EARLY_PRINTK
early_recursion_flag:
	.long 0

early_idt_msg:
	.asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
early_idt_ripmsg:
	.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
	.previous

	.balign PAGE_SIZE

#define NEXT_PAGE(name) \
	.balign	PAGE_SIZE; \
ENTRY(name)

/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT)		\
	i = 0 ;					\
	.rept (COUNT) ;				\
	.quad	(START) + (i << 21) + (PERM) ;	\
	i = i + 1 ;				\
	.endr
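/*
 * As a sketch of what the macro emits: PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 3)
 * would expand to three 2M entries,
 *	.quad 0x000000 + __PAGE_KERNEL_LARGE_EXEC
 *	.quad 0x200000 + __PAGE_KERNEL_LARGE_EXEC
 *	.quad 0x400000 + __PAGE_KERNEL_LARGE_EXEC
 * i.e. consecutive 2M physical ranges starting at START, all with PERM.
 */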

/*
 * This default setting generates an ident mapping at address 0x100000
 * and a mapping for the kernel that precisely maps virtual address
 * 0xffffffff80000000 to physical address 0x000000. (always using
 * 2Mbyte large pages provided by PAE mode)
 */
NEXT_PAGE(init_level4_pgt)
	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.fill	257,8,0
	.quad	level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.fill	252,8,0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE

NEXT_PAGE(level3_ident_pgt)
	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.fill	511,8,0

NEXT_PAGE(level3_kernel_pgt)
	.fill	510,8,0
	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE

NEXT_PAGE(level2_fixmap_pgt)
	.fill	506,8,0
	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
	.fill	5,8,0

NEXT_PAGE(level1_fixmap_pgt)
	.fill	512,8,0

NEXT_PAGE(level2_ident_pgt)
	/* Since I easily can, map the first 1G.
	 * Don't set NX because code runs from these pages.
	 */
	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)

NEXT_PAGE(level2_kernel_pgt)
	/*
	 * 512 MB kernel mapping. We spend a full page on this pagetable
	 * anyway.
	 *
	 * The kernel code+data+bss must not be bigger than that.
	 *
	 * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
	 *  If you want to increase this then increase MODULES_VADDR
	 *  too.)
	 */
	PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
		KERNEL_IMAGE_SIZE/PMD_SIZE)
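	/*
	 * With the 512 MB image size noted above and 2 MB pmds this expands
	 * to 512M / 2M = 256 entries, so only the first half of this page
	 * is populated; the rest stays zero.
	 */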

NEXT_PAGE(level2_spare_pgt)
	.fill   512, 8, 0

#undef PMDS
#undef NEXT_PAGE

	.data
	.align 16
	.globl cpu_gdt_descr
cpu_gdt_descr:
	.word	gdt_end-cpu_gdt_table-1
gdt:
	.quad	cpu_gdt_table
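	/*
	 * lgdt loads a 10-byte descriptor in long mode: the 16-bit limit
	 * above followed by the 64-bit linear base stored at the gdt label.
	 */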
#ifdef CONFIG_SMP
	.rept	NR_CPUS-1
	.word	0
	.quad	0
	.endr
#endif

ENTRY(phys_base)
	/* This must match the first entry in level2_kernel_pgt */
	.quad   0x0000000000000000

/* We need valid kernel segments for data and code in long mode too.
 * IRET will check the segment types (kkeil 2000/10/28).
 * Also sysret mandates a special GDT layout.
 */

	.section .data.page_aligned, "aw"
	.align PAGE_SIZE

/* The TLS descriptors are currently at a different place compared to i386.
   Hopefully nobody expects them at a fixed place (Wine?) */

ENTRY(cpu_gdt_table)
	.quad	0x0000000000000000	/* NULL descriptor */
	.quad	0x00cf9b000000ffff	/* __KERNEL32_CS */
	.quad	0x00af9b000000ffff	/* __KERNEL_CS */
	.quad	0x00cf93000000ffff	/* __KERNEL_DS */
	.quad	0x00cffb000000ffff	/* __USER32_CS */
	.quad	0x00cff3000000ffff	/* __USER_DS, __USER32_DS  */
	.quad	0x00affb000000ffff	/* __USER_CS */
	.quad	0x0			/* unused */
	.quad	0,0			/* TSS */
	.quad	0,0			/* LDT */
	.quad   0,0,0			/* three TLS descriptors */
	.quad	0x0000f40000000000	/* node/CPU stored in limit */
gdt_end:
	/* asm/segment.h:GDT_ENTRIES must match this */
	/* This should be a multiple of the cache line size */
	/* GDTs of other CPUs are now dynamically allocated */

	/* zero the remaining page */
	.fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0

	.section .bss, "aw", @nobits
	.align L1_CACHE_BYTES
ENTRY(idt_table)
	.skip 256 * 16

	.section .bss.page_aligned, "aw", @nobits
	.align PAGE_SIZE
ENTRY(empty_zero_page)
	.skip PAGE_SIZE