sh: Migrate necessary libgcc bits in to arch/sh/lib for SUPERH32.

This moves in the necessary libgcc bits for SUPERH32 and drops the
libgcc linking for the regular targets. This in turn allows us to rip
out quite a few hacks both in sh_ksyms_32 and arch/sh/Makefile.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
diff --git a/arch/sh/lib/udivsi3-Os.S b/arch/sh/lib/udivsi3-Os.S
new file mode 100644
index 0000000..2bed765
--- /dev/null
+++ b/arch/sh/lib/udivsi3-Os.S
@@ -0,0 +1,147 @@
+/* Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+/* Moderately Space-optimized libgcc routines for the Renesas SH /
+   STMicroelectronics ST40 CPUs.
+   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
+
+/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
+   sh4-200 run times:
+   udiv small divisor: 55 cycles
+   udiv large divisor: 52 cycles
+   sdiv small divisor, positive result: 59 cycles
+   sdiv large divisor, positive result: 56 cycles
+   sdiv small divisor, negative result: 65 cycles (*)
+   sdiv large divisor, negative result: 62 cycles (*)
+   (*): r2 is restored in the rts delay slot and has a lingering latency
+        of two more cycles.  */
+	.balign 4
+	.global	__udivsi3_i4i
+	.global __udivsi3
+	.set	__udivsi3, __udivsi3_i4i
+	.type	__udivsi3_i4i, @function
+	.type	__sdivsi3_i4i, @function
+__udivsi3_i4i:
+	sts pr,r1
+	mov.l r4,@-r15
+	extu.w r5,r0
+	cmp/eq r5,r0
+	swap.w r4,r0
+	shlr16 r4
+	bf/s large_divisor
+	div0u
+	mov.l r5,@-r15
+	shll16 r5
+sdiv_small_divisor:
+	div1 r5,r4
+	bsr div6
+	div1 r5,r4
+	div1 r5,r4
+	bsr div6
+	div1 r5,r4
+	xtrct r4,r0
+	xtrct r0,r4
+	bsr div7
+	swap.w r4,r4
+	div1 r5,r4
+	bsr div7
+	div1 r5,r4
+	xtrct r4,r0
+	mov.l @r15+,r5
+	swap.w r0,r0
+	mov.l @r15+,r4
+	jmp @r1
+	rotcl r0
+div7:
+	div1 r5,r4
+div6:
+	            div1 r5,r4; div1 r5,r4; div1 r5,r4
+	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4
+
+divx3:
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	rts
+	div1 r5,r4
+
+large_divisor:
+	mov.l r5,@-r15
+sdiv_large_divisor:
+	xor r4,r0
+	.rept 4
+	rotcl r0
+	bsr divx3
+	div1 r5,r4
+	.endr
+	mov.l @r15+,r5
+	mov.l @r15+,r4
+	jmp @r1
+	rotcl r0
+
+	.global	__sdivsi3_i4i
+	.global __sdivsi3
+	.set	__sdivsi3, __sdivsi3_i4i
+__sdivsi3_i4i:
+	mov.l r4,@-r15
+	cmp/pz r5
+	mov.l r5,@-r15
+	bt/s pos_divisor
+	cmp/pz r4
+	neg r5,r5
+	extu.w r5,r0
+	bt/s neg_result
+	cmp/eq r5,r0
+	neg r4,r4
+pos_result:
+	swap.w r4,r0
+	bra sdiv_check_divisor
+	sts pr,r1
+pos_divisor:
+	extu.w r5,r0
+	bt/s pos_result
+	cmp/eq r5,r0
+	neg r4,r4
+neg_result:
+	mova negate_result,r0
+	;
+	mov r0,r1
+	swap.w r4,r0
+	lds r2,macl
+	sts pr,r2
+sdiv_check_divisor:
+	shlr16 r4
+	bf/s sdiv_large_divisor
+	div0u
+	bra sdiv_small_divisor
+	shll16 r5
+	.balign 4
+negate_result:
+	neg r0,r0
+	jmp @r2
+	sts macl,r2