This patch contains what was previously known as patch #5 and patch #8. It re-introduces the userRegs pointer in fpa11 structure, and changes the entry.S code to gain 2.5% speed improvement, mostly because it avoids repeated checks for FPE init on successive FP operations. What we WANT is to have the entry.S changes, but to NOT have the extra member in struct fpa11. Rather we want to use RMK's way of keeping the userRegs in 'sl' instead. I could not make this work, hence this patch as it stands. I will try to convert to the desired 'sl' behaviour in a follow-up patch. With this patch applied (and all the others) the unixbench float speed is confirmed at 925 lines-per-second on my test machine, an 11% improvement over the starting value of 833 lps. diff -uNr linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/entry.S linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/entry.S --- linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/entry.S 2003-03-22 13:33:25.000000000 -0500 +++ linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/entry.S 2003-03-23 20:00:02.000000000 -0500 @@ -1,7 +1,7 @@ /* NetWinder Floating Point Emulator (c) Rebel.COM, 1998 - (c) 1998, 1999 Philip Blundell + (c) 1998, 1999, 2001 Philip Blundell Direct questions, comments to Scott Bambrough @@ -50,8 +50,11 @@ This routine does three things: -1) The kernel has created a struct pt_regs on the stack and saved the -user registers into it. See inclue/asm-arm/proc/ptrace.h for details. +1) It saves SP into a variable called userRegisters. The kernel has +created a struct pt_regs on the stack and saved the user registers +into it. See /usr/include/asm/proc/ptrace.h for details. The +emulator code uses userRegisters as the base of an array of words from +which the contents of the registers can be extracted. 2) It calls EmulateAll to emulate a floating point instruction. EmulateAll returns 1 if the emulation was successful, or 0 if not. 
@@ -70,42 +73,58 @@ .globl nwfpe_enter nwfpe_enter: - mov r4, lr @ save the failure-return addresses - mov sl, sp @ we access the registers via 'sl' - ldr ip, [r10, #112] @ get init_flag - - ldr r5, [sp, #60] @ get contents of PC; - cmp ip, #0 - bleq nwfpe_init_fpa - sub r8, r5, #4 -.Lx1: ldrt r0, [r8] @ get actual instruction into r0 -emulate: - bl EmulateAll @ emulate the instruction - cmp r0, #0 @ was emulation successful - moveq pc, r4 @ no, return failure + /* ?? Could put userRegisters and fpa11 into fixed regs during + emulation. This would reduce load/store overhead at the expense + of stealing two regs from the register allocator. Not sure if + it's worth it. */ + mrs r7, cpsr @ enable irqs + bic ip, r7, #0x80 + msr cpsr_c, ip + str sp, [r10, #116] @ save pointer to user regs + ldr ip, [r10, #112] @ get init_flag + mov r4, sp @ use r4 for local pointer + mov r10, lr @ save the failure-return addresses + ldr r6, =nwfpe_aCC + + ldr r5, [r4, #60] @ get user PC + cmp ip, #0 + bleq nwfpe_init_fpa + sub r8, r5, #4 +.Lx1: ldrt r0, [r8] @ get actual instruction into r0 + bl EmulateAll @ emulate the instruction +ret_from_emulate: + cmp r0, #0 @ was emulation successful + beq failed next: -.Lx2: ldrt r6, [r5], #4 @ get the next instruction and +.Lx2: ldrt r0, [r5], #4 @ get the next instruction and @ increment PC + ldr r1, [r4, #64] @ fetch the condition codes - and r2, r6, #0x0F000000 @ test for FP insns - teq r2, #0x0C000000 - teqne r2, #0x0D000000 - teqne r2, #0x0E000000 - movne pc, r9 @ return ok if not a fp insn - - str r5, [sp, #60] @ update PC copy in regs - - mov r0, r6 @ save a copy - ldr r1, [sp, #64] @ fetch the condition codes - bl checkCondition @ check the condition - cmp r0, #0 @ r0 = 0 ==> condition failed + and r2, r0, #0x0F000000 @ test for FP insns + teq r2, #0x0C000000 + teqne r2, #0x0D000000 + teqne r2, #0x0E000000 + msrne cpsr_c, r7 + movne pc, r9 @ return ok if not a fp insn + + str r5, [r4, #60] @ update PC copy in regs + + mov r1, r1, lsr 
#28 @ check predicate against CC + mov ip, r0, lsr #28 + mov ip, ip, lsl #2 + ldr ip, [r6, ip] + mov ip, ip, lsr r1 + tst ip, #1 - @ if condition code failed to match, next insn - beq next @ get the next instruction; + adrne lr, ret_from_emulate @ condition passed, emulate it + bne EmulateAll + + b next @ get the next instruction - mov r0, r6 @ prepare for EmulateAll() - b emulate @ if r0 != 0, goto EmulateAll +failed: + msr cpsr_c, r7 + mov pc, r10 @ no, return failure @ We need to be prepared for the instructions at .Lx1 and .Lx2 @ to fault. Emit the appropriate exception gunk to fix things up. @@ -113,7 +132,8 @@ @ plain LDR instruction. Weird, but it seems harmless. .section .fixup,"ax" .align 2 -.Lfix: mov pc, r9 @ let the user eat segfaults +.Lfix: msr cpsr_c, r7 + mov pc, r9 @ let the user eat segfaults .previous .section __ex_table,"a" diff -uNr linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/fpa11.h linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/fpa11.h --- linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/fpa11.h 2003-03-22 13:17:22.000000000 -0500 +++ linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/fpa11.h 2003-03-23 19:58:13.000000000 -0500 @@ -22,25 +22,17 @@ #ifndef __FPA11_H__ #define __FPA11_H__ -#define GET_FPA11() ((FPA11 *)(&current->thread.fpstate)) - -/* - * The processes registers are always at the very top of the 8K - * stack+task struct. Use the same method as 'current' uses to - * reach them. - */ -register unsigned int *user_registers asm("sl"); - -#define GET_USERREG() (user_registers) - -/* Need task_struct */ -#include - /* includes */ #include "fpsr.h" /* FP control and status register definitions */ #include "milieu.h" #include "softfloat.h" +/* #define NWFPE_DEBUG */ + +/* Need task_struct */ +#include +#include + #define typeNone 0x00 #define typeSingle 0x01 #define typeDouble 0x02 @@ -84,10 +76,18 @@ so we can use it to detect whether this instance of the emulator needs to be initialised. 
*/ +/* 116 */ unsigned int *userRegisters; } FPA11; extern void resetFPA11(void); extern void SetRoundingMode(const unsigned int); extern void SetRoundingPrecision(const unsigned int); +#if LINUX_VERSION_CODE > 0x020400 +#define GET_FPA11() ((FPA11 *)(&current->thread.fpstate)) +#else +#define GET_FPA11() ((FPA11 *)(&current->tss.fpstate)) +#endif +#define GET_USERREG() (GET_FPA11()->userRegisters) + #endif diff -uNr linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/fpmodule.c linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/fpmodule.c --- linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/fpmodule.c 2003-03-22 13:17:22.000000000 -0500 +++ linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/fpmodule.c 2003-03-23 19:58:13.000000000 -0500 @@ -92,11 +92,6 @@ printk(KERN_ERR "nwfpe: bad structure size\n"); return -EINVAL; } - - if (sizeof(FPREG) != 12) { - printk(KERN_ERR "nwfpe: bad register size\n"); - return -EINVAL; - } #ifdef MODULE if (!mod_member_present(&__this_module, can_unload)) return -EINVAL; diff -uNr linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/fpopcode.c linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/fpopcode.c --- linux-2.4.19-rmk7-rfs6/arch/arm/nwfpe/fpopcode.c 2003-03-22 13:22:35.000000000 -0500 +++ linux-2.4.19-rmk7-rfs7/arch/arm/nwfpe/fpopcode.c 2003-03-23 20:00:02.000000000 -0500 @@ -64,7 +64,7 @@ /* condition code lookup table index into the table is test code: EQ, NE, ... LT, GT, AL, NV bit position in short is condition code: NZCV */ -static const unsigned short aCC[16] = { +const unsigned long nwfpe_aCC[16] = { 0xF0F0, // EQ == Z set 0x0F0F, // NE 0xCCCC, // CS == C set @@ -85,5 +85,5 @@ unsigned int checkCondition(const unsigned int opcode, const unsigned int ccodes) { - return (aCC[opcode >> 28] >> (ccodes >> 28)) & 1; + return (nwfpe_aCC[opcode >> 28] >> (ccodes >> 28)) & 1; }