percpu: align percpu readmostly subsection to cacheline

Currently percpu readmostly subsection may share cachelines with other
percpu subsections which may result in unnecessary cacheline bounce
and performance degradation.

This patch adds @cacheline parameter to PERCPU() and PERCPU_VADDR()
linker macros, makes each arch linker scripts specify its cacheline
size and use it to align percpu subsections.

This is based on Shaohua's x86 only patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shaohua.li@intel.com>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf47007..cef446f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -230,7 +230,7 @@
 	 * output PHDR, so the next output section - .init.text - should
 	 * start another segment - init.
 	 */
-	PERCPU_VADDR(0, :percpu)
+	PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
 #endif
 
 	INIT_TEXT_SECTION(PAGE_SIZE)
@@ -305,7 +305,7 @@
 	}
 
 #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
-	PERCPU(THREAD_SIZE)
+	PERCPU(INTERNODE_CACHE_BYTES, THREAD_SIZE)
 #endif
 
 	. = ALIGN(PAGE_SIZE);