Up to 30% faster exp2 by avoiding the slow frndint and fscale instructions. expm1 also takes a much more direct path for small arguments (the expected use case).
@@ -1,3 +1,37 @@
+.global expm1f
+.type expm1f,@function
+expm1f:
+ flds 4(%esp)              # push the 32-bit float stored at 4(%esp) onto the x87 stack
+ jmp 1f                    # continue at the next "1:" label, inside expm1 below
+
+.global expm1l
+.type expm1l,@function
+expm1l:
+ fldt 4(%esp)              # push the 80-bit long double stored at 4(%esp)
+ jmp 1f                    # skip expm1's own fldl so exactly one value is on the x87 stack
+
+.global expm1
+.type expm1,@function
+expm1:
+ fldl 4(%esp)              # push the 64-bit double stored at 4(%esp)
+1: fldl2e                  # shared entry (also reached from expm1f): st0 = x
+ fmulp                     # st0 = y = x * log2(e), so e^x - 1 = 2^y - 1
+ fld1
+ fld %st(1)
+ fabs                      # st0 = |y|, st1 = 1.0, st2 = y
+ fucom %st(1)              # compare |y| with 1.0
+ fnstsw %ax
+ fstp %st(0)               # drop |y|
+ fstp %st(0)               # drop 1.0 too, so st0 = y again
+ sahf
+ ja 1f                     # |y| > 1 is outside the range f2xm1 accepts
+ f2xm1                     # st0 = 2^y - 1
+ ret
+1: call 1f                 # call the next "1:" label (the exp2 body in this file): st0 = 2^y
+ fld1
+ fsubrp                    # st0 = 2^y - 1
+ ret
+
.global exp2f
.type exp2f,@function
exp2f:
@@ -34,22 +68,53 @@ exp:
.type exp2,@function
exp2:
fldl 4(%esp)
-1: fxam
- fnstsw %ax
+# Shared 2^x body: entered with x in st0, returns 2^x in st0.
+# 2^x = 2^(x-n) * 2^n, n = x rounded to integer; 2^n is assembled by hand
+# as an 80-bit value in a 12-byte stack slot.  |n| must stay <= 16382 so
+# that n+0x3fff fits the 15-bit exponent field without reaching the sign bit.
+1: push $0x467ff800        # 16382.0f cutoff; this dword ends up at 8(%esp)
+ flds (%esp)               # st0 = 16382.0, st1 = x
+ push $0x80000000          # mantissa high dword: explicit integer bit only
+ push $0                   # mantissa low dword
+ fld %st(1)
+ fabs                      # st0 = |x|
+ fucom %st(1)              # compare |x| with the cutoff
+ fnstsw
+ fstp %st(0)
+ fstp %st(0)               # drop |x| and the cutoff; st0 = x again
sahf
- jnp 1f
- jnc 1f
- fstps 4(%esp)
- mov $0xfe,%al
- and %al,7(%esp)
- flds 4(%esp)
-1: fld %st(0)
- frndint
+ ja 2f                     # |x| > cutoff: take the overflow/underflow path
+ fld %st(0)
+ fistpl 8(%esp)            # 8(%esp) = n
+ fildl 8(%esp)             # st0 = n, st1 = x
fxch %st(1)
fsub %st(1)
+ mov $0x3fff,%eax
+ add %eax,8(%esp)          # add the exponent bias: sign/exponent word of 2^n
f2xm1
fld1
faddp
- fscale
+ fldt (%esp)               # st0 = 2^n, st1 = 2^(x-n), st2 = n
+ fmulp                     # st0 = 2^x, st1 = n
fstp %st(1)
+ add $12,%esp              # release the 12-byte slot
+ ret
+
+# |x| > 16382: a NaN is returned unchanged; otherwise a huge (x > 0) or
+# tiny (x < 0) value is squared so the result becomes +inf or 0.
+2: fsts 8(%esp)            # single-precision copy of x, read only for sign and class
+ mov 8(%esp),%eax
+ lea (%eax,%eax),%ecx      # shift the sign bit out
+ cmp $0xff000000,%ecx
+ ja 2f                     # all-ones exponent with nonzero fraction: NaN, return x
+ fstp %st(0)               # discard x
+ xor %ecx,%ecx
+ inc %ecx                  # exponent word 1: 2^-16382
+ add %eax,%eax             # CF = sign bit of x
+ jc 1f                     # negative x keeps the tiny value
+ mov $0x7ffe,%ecx          # exponent word 0x7ffe: 2^16383
+1: mov %ecx,8(%esp)
+ fldt (%esp)
+ fmul %st(0)               # square it: underflows to 0 or overflows to +inf
+2: add $12,%esp
ret
@@ -1,47 +1 @@
-.global expm1f
-.type expm1f,@function
-expm1f:
- jmp 1f
-
-.global expm1l
-.type expm1l,@function
-expm1l:
- fldt 4(%esp)
-.global expm1
-.type expm1,@function
-expm1:
- fldl 4(%esp)
- sahf
-1: fldl2e
- fmulp
- fld %st(0)
- fldz
- fcomp
- jnz 1f
- fstp %st(0)
- f2xm1
- ret
-1: fxch %st(1)
- fsub %st(1)
- fld1
- faddp
- fsubrp
- fstp %st(1)
+# see exp.s