Browse Source

optimize scalbn family

the fscale instruction is slow everywhere, probably because it
involves a costly and unnecessary integer truncation operation that
ends up being a no-op in common usages. instead, construct a floating
point scale value with integer arithmetic and simply multiply by it,
when possible.

for float and double, this is always possible by going to the
next-larger type. we use some cheap but effective saturating
arithmetic tricks to make sure even very large-magnitude exponents
fit. for long double, if the scaling exponent is too large to fit in
the exponent of a long double value, we simply fall back to the
expensive fscale method.

on atom cpu, these changes speed up scalbn by over 30%. (min rdtsc
timing dropped from 110 cycles to 70 cycles.)
Rich Felker 13 years ago
parent
commit
baa43bca0a
3 changed files with 46 additions and 7 deletions
  1. 16 3
      src/math/i386/scalbn.s
  2. 15 3
      src/math/i386/scalbnf.s
  3. 15 1
      src/math/i386/scalbnl.s

+ 16 - 3
src/math/i386/scalbn.s

@@ -11,10 +11,23 @@ scalbln:
 .global scalbn
 .type scalbn,@function
 scalbn:
-	fildl 12(%esp)
+	mov 12(%esp),%eax	# eax = n, the int argument stored after the 8-byte double x
+	add $0x3ffe,%eax	# bias n so a single unsigned compare checks both bounds
+	cmp $0x7ffd,%eax
+	jb 1f	# taken when -0x3ffe <= n <= 0x3ffe: biased n is usable as-is
+	sub $0x3ffe,%eax	# out of range: undo the bias to recover n
+	sar $31,%eax	# eax = 0 if n >= 0, -1 if n < 0
+	xor $0xfff,%eax	# saturate: 0xfff for n >= 0, -0x1000 for n < 0
+	add $0x3ffe,%eax	# re-apply the bias to the saturated exponent
+1:	inc %eax	# 0x3ffe + 1 = 0x3fff, the exponent bias of the 80-bit extended format
 	fldl 4(%esp)
-	fscale
-	fstp %st(1)
+	mov %eax,12(%esp)	# x is already on the FPU stack; reuse the argument slots to build an 80-bit scale at 4(%esp): sign/exponent word
+	mov $0x80000000,%eax
+	mov %eax,8(%esp)	# high mantissa dword: only the explicit integer bit set (significand = 1.0)
+	xor %eax,%eax
+	mov %eax,4(%esp)	# low mantissa dword = 0, so the value is exactly 2^(clamped n)
+	fldt 4(%esp)	# push the scale factor as an extended-precision value
+	fmulp	# x * scale, replacing fscale; the fstpl/fldl pair below stores and reloads the product as a double
 	fstpl 4(%esp)
 	fldl 4(%esp)
 	ret

+ 15 - 3
src/math/i386/scalbnf.s

@@ -11,10 +11,22 @@ scalblnf:
 .global scalbnf
 .type scalbnf,@function
 scalbnf:
-	fildl 8(%esp)
+	mov 8(%esp),%eax	# eax = n, the int argument stored after the 4-byte float x
+	add $0x3fe,%eax	# bias n so a single unsigned compare checks both bounds
+	cmp $0x7fd,%eax
+	jb 1f	# taken when -0x3fe <= n <= 0x3fe: biased n is usable as-is
+	sub $0x3fe,%eax	# out of range: undo the bias to recover n
+	sar $31,%eax	# eax = 0 if n >= 0, -1 if n < 0
+	xor $0x1ff,%eax	# saturate: 0x1ff for n >= 0, -0x200 for n < 0
+	add $0x3fe,%eax	# re-apply the bias to the saturated exponent
+1:	inc %eax	# 0x3fe + 1 = 0x3ff, the exponent bias of the 64-bit double format
+	shl $20,%eax	# move the biased exponent into bits 20..30, the exponent field of a double's high dword
 	flds 4(%esp)
-	fscale
-	fstp %st(1)
+	mov %eax,8(%esp)	# x is already on the FPU stack; reuse the argument slots to build a double scale at 4(%esp): high dword
+	xor %eax,%eax
+	mov %eax,4(%esp)	# low dword = 0: zero mantissa, so the value is exactly 2^(clamped n)
+	fldl 4(%esp)	# push the scale factor as a double
+	fmulp	# x * scale, replacing fscale; the fstps/flds pair below stores and reloads the product as a float
 	fstps 4(%esp)
 	flds 4(%esp)
 	ret

+ 15 - 1
src/math/i386/scalbnl.s

@@ -11,7 +11,21 @@ scalblnl:
 .global scalbnl
 .type scalbnl,@function
 scalbnl:
-	fildl 16(%esp)
+	mov 16(%esp),%eax
+	add $0x3ffe,%eax
+	cmp $0x7ffd,%eax
+	jae 1f
+	inc %eax
+	fldt 4(%esp)
+	mov %eax,12(%esp)
+	mov $0x80000000,%eax
+	mov %eax,8(%esp)
+	xor %eax,%eax
+	mov %eax,4(%esp)
+	fldt 4(%esp)
+	fmulp
+	ret
+1:	fildl 16(%esp)
 	fldt 4(%esp)
 	fscale
 	fstp %st(1)