浏览代码

optimize x86 feclearexcept: only use save/restore x87 fenv if needed

the x87 exception summary (ES) and stack fault (SF) flags may be
spuriously cleared by feclearexcept using the fnclex instruction,
but these flags are not observable through libc, so maintaining
their state is not critical.
Szabolcs Nagy 11 年之前
父节点
当前提交
d8764bf840
共有 2 个文件被更改,包括 38 次插入 和 27 次删除
  1. 27 12
      src/fenv/i386/fenv.s
  2. 11 15
      src/fenv/x86_64/fenv.s

+ 27 - 12
src/fenv/i386/fenv.s

@@ -4,26 +4,41 @@
 .type feclearexcept,@function
 .type feclearexcept,@function
 feclearexcept:	
 feclearexcept:	
 	mov 4(%esp),%ecx
 	mov 4(%esp),%ecx
-	not %ecx
+	fnstsw %ax
 		# consider sse fenv as well if the cpu has XMM capability
 		# consider sse fenv as well if the cpu has XMM capability
 	call 1f
 	call 1f
 1:	addl $__hwcap-1b,(%esp)
 1:	addl $__hwcap-1b,(%esp)
 	pop %edx
 	pop %edx
 	testl $0x02000000,(%edx)
 	testl $0x02000000,(%edx)
+	jz 2f
+		# maintain exceptions in the sse mxcsr, clear x87 exceptions
+	test %eax,%ecx
 	jz 1f
 	jz 1f
-	stmxcsr 4(%esp)
-	and %ecx,4(%esp)
-	ldmxcsr 4(%esp)
-1:	test $0x3f,%ecx
-	jnz 2f
-1:	fnclex
-	xor %eax,%eax
+	fnclex
+1:	push %edx
+	stmxcsr (%esp)
+	pop %edx
+	and $0x3f,%eax
+	or %eax,%edx
+	test %edx,%ecx
+	jz 1f
+	not %ecx
+	and %ecx,%edx
+	push %edx
+	ldmxcsr (%esp)
+	pop %edx
+1:	xor %eax,%eax
 	ret
 	ret
-2:	fnstsw %ax
-		# TODO: only load/store fenv if exceptions arent clear yet
-	and %ecx,%eax
+		# only do the expensive x87 fenv load/store when needed
+2:	test %eax,%ecx
 	jz 1b
 	jz 1b
-	sub $32,%esp
+	not %ecx
+	and %ecx,%eax
+	test $0x3f,%eax
+	jz 1f
+	fnclex
+	jmp 1b
+1:	sub $32,%esp
 	fnstenv (%esp)
 	fnstenv (%esp)
 	mov %al,4(%esp)
 	mov %al,4(%esp)
 	fldenv (%esp)
 	fldenv (%esp)

+ 11 - 15
src/fenv/x86_64/fenv.s

@@ -1,25 +1,21 @@
 .global feclearexcept
 .global feclearexcept
 .type feclearexcept,@function
 .type feclearexcept,@function
 feclearexcept:
 feclearexcept:
+		# maintain exceptions in the sse mxcsr, clear x87 exceptions
 	mov %edi,%ecx
 	mov %edi,%ecx
+	fnstsw %ax
+	test %eax,%ecx
+	jz 1f
+	fnclex
+1:	stmxcsr -8(%rsp)
+	and $0x3f,%eax
+	or %eax,-8(%rsp)
+	test %ecx,-8(%rsp)
+	jz 1f
 	not %ecx
 	not %ecx
-	stmxcsr -8(%rsp)
 	and %ecx,-8(%rsp)
 	and %ecx,-8(%rsp)
 	ldmxcsr -8(%rsp)
 	ldmxcsr -8(%rsp)
-	test $0x3f,%ecx
-	jnz 2f
-1:	fnclex
-	xor %eax,%eax
-	ret
-2:	fnstsw %ax
-	and %ecx,%eax
-	jz 1b
-	sub $32,%rsp
-	fnstenv (%rsp)
-	mov %al,4(%rsp)
-	fldenv (%rsp)
-	add $32,%rsp
-	xor %eax,%eax
+1:	xor %eax,%eax
 	ret
 	ret
 
 
 .global feraiseexcept
 .global feraiseexcept