From 4247b5b452298de8126a97693ea0c13f34f82970 Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>
Date: Tue, 25 Jun 2013 20:53:11 +0530
Subject: [PATCH]  Fix libm performance regression due to set/restore rounding
 mode (#977887)

---
 glibc-rh977887-2.patch | 337 +++++++++++++++++++++++++++++++++++++++++
 glibc-rh977887.patch   |  46 ++++++
 glibc.spec             |   9 +-
 3 files changed, 391 insertions(+), 1 deletion(-)
 create mode 100644 glibc-rh977887-2.patch
 create mode 100644 glibc-rh977887.patch

diff --git a/glibc-rh977887-2.patch b/glibc-rh977887-2.patch
new file mode 100644
index 0000000..05648bc
--- /dev/null
+++ b/glibc-rh977887-2.patch
@@ -0,0 +1,337 @@
+commit 2506109403de69bd454de27835d42e6eb6ec3abc
+Author: Siddhesh Poyarekar <siddhesh@redhat.com>
+Date:   Wed Jun 12 10:36:48 2013 +0530
+
+    Set/restore rounding mode only when needed
+    
+    The most common use case of math functions is with default rounding
+    mode, i.e. rounding to nearest.  Setting and restoring rounding mode
+    is an unnecessary overhead for this, so I've added support for a
+    context, which does the set/restore only if the FP status needs a
+    change.  The code is written such that only x86 uses this context.
+    Other architectures should be unaffected by it, but would definitely
+    benefit if their set/restore has as much overhead, relative to the
+    rest of the code, as it does on x86.
+    
+    Here's a summary of the performance gains from these changes; I've
+    only mentioned functions that use the set/restore and have benchmark
+    inputs for x86_64:
+    
+    Before:
+    
+    cos(): ITERS:4.69335e+08: TOTAL:28884.6Mcy, MAX:4080.28cy, MIN:57.562cy, 16248.6 calls/Mcy
+    exp(): ITERS:4.47604e+08: TOTAL:28796.2Mcy, MAX:207.721cy, MIN:62.385cy, 15543.9 calls/Mcy
+    pow(): ITERS:1.63485e+08: TOTAL:28879.9Mcy, MAX:362.255cy, MIN:172.469cy, 5660.86 calls/Mcy
+    sin(): ITERS:3.89578e+08: TOTAL:28900Mcy, MAX:704.859cy, MIN:47.583cy, 13480.2 calls/Mcy
+    tan(): ITERS:7.0971e+07: TOTAL:28902.2Mcy, MAX:1357.79cy, MIN:388.58cy, 2455.55 calls/Mcy
+    
+    After:
+    
+    cos(): ITERS:6.0014e+08: TOTAL:28875.9Mcy, MAX:364.283cy, MIN:45.716cy, 20783.4 calls/Mcy
+    exp(): ITERS:5.48578e+08: TOTAL:28764.9Mcy, MAX:191.617cy, MIN:51.011cy, 19071.1 calls/Mcy
+    pow(): ITERS:1.70013e+08: TOTAL:28873.6Mcy, MAX:689.522cy, MIN:163.989cy, 5888.18 calls/Mcy
+    sin(): ITERS:4.64079e+08: TOTAL:28891.5Mcy, MAX:6959.3cy, MIN:36.189cy, 16062.8 calls/Mcy
+    tan(): ITERS:7.2354e+07: TOTAL:28898.9Mcy, MAX:1295.57cy, MIN:380.698cy, 2503.7 calls/Mcy
+    
+    So the improvements are:
+    
+    cos: 27.9089%
+    exp: 22.6919%
+    pow: 4.01564%
+    sin: 19.1585%
+    tan: 1.96086%
+    
+    The downside of the change is that it will have an adverse performance
+    impact on non-default rounding modes, but I think the tradeoff is
+    justified.
+
+diff --git a/include/fenv.h b/include/fenv.h
+index ed6d139..9f90d17 100644
+--- a/include/fenv.h
++++ b/include/fenv.h
+@@ -1,5 +1,6 @@
+ #ifndef _FENV_H
+ #include <math/fenv.h>
++#include <stdbool.h>
+ 
+ /* Now define the internal interfaces.  */
+ 
+@@ -23,4 +24,13 @@ libm_hidden_proto (fetestexcept)
+ libm_hidden_proto (feupdateenv)
+ libm_hidden_proto (fetestexcept)
+ 
+/* Rounding mode context.  This allows functions to set/restore rounding mode
+   only when the desired rounding mode is different from the current rounding
+   mode.  */
+struct rm_ctx
+{
+  fenv_t env;			/* FP state saved by the hold call.  */
+  bool updated_status;		/* True if the hold call changed FP state.  */
+};
++
+ #endif
+diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
+index e98360d..c0fc03d 100644
+--- a/sysdeps/generic/math_private.h
++++ b/sysdeps/generic/math_private.h
+@@ -553,35 +553,62 @@ default_libc_feupdateenv_test (fenv_t *e, int ex)
+ # define libc_feresetround_noexl libc_fesetenvl
+ #endif
+ 
++#if HAVE_RM_CTX
++/* Set/Restore Rounding Modes only when necessary.  If defined, these functions
++   set/restore floating point state only if the state needed within the lexical
++   block is different from the current state.  This saves a lot of time when
++   the floating point unit is much slower than the fixed point units.  */
++
++# ifndef libc_feresetround_noex_ctx
++#   define libc_feresetround_noex_ctx  libc_fesetenv_ctx
++# endif
++# ifndef libc_feresetround_noexf_ctx
++#   define libc_feresetround_noexf_ctx libc_fesetenvf_ctx
++# endif
++# ifndef libc_feresetround_noexl_ctx
++#   define libc_feresetround_noexl_ctx libc_fesetenvl_ctx
++# endif
++
++# ifndef libc_feholdsetround_53bit_ctx
++#   define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx
++# endif
++
++# ifndef libc_feresetround_53bit_ctx
++#   define libc_feresetround_53bit_ctx libc_feresetround_ctx
++# endif
++
++# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \
++  struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx)));	      \
++  ROUNDFUNC ## _ctx (&ctx, (RM))
++#else
++# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \
++  fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC)));	\
++  ROUNDFUNC (&__libc_save_rm, (RM))
++#endif
++
+ /* Save and restore the rounding mode within a lexical block.  */
+ 
+ #define SET_RESTORE_ROUND(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround)));	\
+-  libc_feholdsetround (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround)
+ #define SET_RESTORE_ROUNDF(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetroundf)));	\
+-  libc_feholdsetroundf (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetroundf)
+ #define SET_RESTORE_ROUNDL(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetroundl)));	\
+-  libc_feholdsetroundl (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl)
+ 
+ /* Save and restore the rounding mode within a lexical block, and also
+    the set of exceptions raised within the block may be discarded.  */
+ 
+ #define SET_RESTORE_ROUND_NOEX(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noex))); \
+-  libc_feholdsetround (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex)
+ #define SET_RESTORE_ROUND_NOEXF(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noexf))); \
+-  libc_feholdsetroundf (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf)
+ #define SET_RESTORE_ROUND_NOEXL(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noexl))); \
+-  libc_feholdsetroundl (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl)
+ 
+ /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits.  */
+ #define SET_RESTORE_ROUND_53BIT(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_53bit))); \
+-  libc_feholdsetround_53bit (&__libc_save_rm, (RM))
++  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_53bit,	      \
++			     libc_feresetround_53bit)
+ 
+ #define __nan(str) \
+   (__builtin_constant_p (str) && str[0] == '\0' ? NAN : __nan (str))
+diff --git a/sysdeps/i386/fpu/fenv_private.h b/sysdeps/i386/fpu/fenv_private.h
+index 1f8336c..3998387 100644
+--- a/sysdeps/i386/fpu/fenv_private.h
++++ b/sysdeps/i386/fpu/fenv_private.h
+@@ -322,6 +322,179 @@ libc_feresetround_387 (fenv_t *e)
+ # define libc_feholdsetround_53bit	libc_feholdsetround_387_53bit
+ #endif
+ 
++/* We have support for rounding mode context.  */
++#define HAVE_RM_CTX 1
++
++static __always_inline void
++libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
++{
++  unsigned int mxcsr, new_mxcsr;
++  asm (STMXCSR " %0" : "=m" (*&mxcsr));
++  new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
++
++  ctx->env.__mxcsr = mxcsr;
++  if (__glibc_unlikely (mxcsr != new_mxcsr))
++    {
++      asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
++      ctx->updated_status = true;
++    }
++  else
++    ctx->updated_status = false;
++}
++
++/* Unconditional since we want to overwrite any exceptions that occurred in the
++   context.  This is also why all fehold* functions unconditionally write into
++   ctx->env.  */
++static __always_inline void
++libc_fesetenv_sse_ctx (struct rm_ctx *ctx)
++{
++  libc_fesetenv_sse (&ctx->env);
++}
++
++static __always_inline void
++libc_feupdateenv_sse_ctx (struct rm_ctx *ctx)
++{
++  if (__glibc_unlikely (ctx->updated_status))
++    libc_feupdateenv_test_sse (&ctx->env, 0);
++}
++
++static __always_inline void
++libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r)
++{
++  libc_feholdexcept_387 (&ctx->env);
++
++  fpu_control_t cw = ctx->env.__control_word;
++  fpu_control_t old_cw = cw;
++  cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
++  cw |= r | 0x3f;
++
++  if (__glibc_unlikely (old_cw != cw))
++    {
++      _FPU_SETCW (cw);
++      ctx->updated_status = true;
++    }
++  else
++    ctx->updated_status = false;
++}
++
++static __always_inline void
++libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r)
++{
++  libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
++}
++
++static __always_inline void
++libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r)
++{
++  libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
++}
++
++static __always_inline void
++libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r)
++{
++  fpu_control_t cw, new_cw;
++
++  _FPU_GETCW (cw);
++  new_cw = cw;
++  new_cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
++  new_cw |= r;
++
++  ctx->env.__control_word = cw;
++  if (__glibc_unlikely (new_cw != cw))
++    {
++      _FPU_SETCW (new_cw);
++      ctx->updated_status = true;
++    }
++  else
++    ctx->updated_status = false;
++}
++
++static __always_inline void
++libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r)
++{
++  libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
++}
++
++static __always_inline void
++libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r)
++{
++  libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
++}
++
++static __always_inline void
++libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
++{
++  unsigned int mxcsr, new_mxcsr;
++
++  asm (STMXCSR " %0" : "=m" (*&mxcsr));
++  new_mxcsr = (mxcsr & ~0x6000) | (r << 3);
++
++  ctx->env.__mxcsr = mxcsr;
++  if (__glibc_unlikely (new_mxcsr != mxcsr))
++    {
++      asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
++      ctx->updated_status = true;
++    }
++  else
++    ctx->updated_status = false;
++}
++
++static __always_inline void
++libc_feresetround_sse_ctx (struct rm_ctx *ctx)
++{
++  if (__glibc_unlikely (ctx->updated_status))
++    libc_feresetround_sse (&ctx->env);
++}
++
++static __always_inline void
++libc_feresetround_387_ctx (struct rm_ctx *ctx)
++{
++  if (__glibc_unlikely (ctx->updated_status))
++    _FPU_SETCW (ctx->env.__control_word);
++}
++
++static __always_inline void
++libc_feupdateenv_387_ctx (struct rm_ctx *ctx)
++{
++  if (__glibc_unlikely (ctx->updated_status))
++    libc_feupdateenv_test_387 (&ctx->env, 0);
++}
++
++#ifdef __SSE_MATH__
++# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx
++# define libc_fesetenvf_ctx		libc_fesetenv_sse_ctx
++# define libc_feupdateenvf_ctx		libc_feupdateenv_sse_ctx
++# define libc_feholdsetroundf_ctx	libc_feholdsetround_sse_ctx
++# define libc_feresetroundf_ctx		libc_feresetround_sse_ctx
++#else
++# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx
++# define libc_feupdateenvf_ctx		libc_feupdateenv_387_ctx
++# define libc_feholdsetroundf_ctx	libc_feholdsetround_387_ctx
++# define libc_feresetroundf_ctx		libc_feresetround_387_ctx
++#endif /* __SSE_MATH__ */
++
+#ifdef __SSE2_MATH__
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_sse_ctx
+# define libc_fesetenv_ctx		libc_fesetenv_sse_ctx
+# define libc_feupdateenv_ctx		libc_feupdateenv_sse_ctx
+# define libc_feholdsetround_ctx	libc_feholdsetround_sse_ctx
+# define libc_feresetround_ctx		libc_feresetround_sse_ctx
+#else
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_387_ctx
+# define libc_feupdateenv_ctx		libc_feupdateenv_387_ctx
+# define libc_feholdsetround_ctx	libc_feholdsetround_387_ctx
+# define libc_feresetround_ctx		libc_feresetround_387_ctx
+#endif /* __SSE2_MATH__ */
++#define libc_feholdexcept_setroundl_ctx	libc_feholdexcept_setround_387_ctx
++#define libc_feupdateenvl_ctx		libc_feupdateenv_387_ctx
++#define libc_feholdsetroundl_ctx	libc_feholdsetround_387_ctx
++#define libc_feresetroundl_ctx		libc_feresetround_387_ctx
++
++#ifndef __SSE2_MATH__
++# define libc_feholdsetround_53bit_ctx	libc_feholdsetround_387_53bit_ctx
++# define libc_feresetround_53bit_ctx	libc_feresetround_387_ctx
++#endif
++
+ #undef __mxcsr
+ 
+ #endif /* FENV_PRIVATE_H */
diff --git a/glibc-rh977887.patch b/glibc-rh977887.patch
new file mode 100644
index 0000000..69e990d
--- /dev/null
+++ b/glibc-rh977887.patch
@@ -0,0 +1,46 @@
+commit 4c60cb0c8329dd498e9cce3735e5ee6212ad28f4
+Author: Siddhesh Poyarekar <siddhesh@redhat.com>
+Date:   Wed Jun 5 13:56:19 2013 +0530
+
+    Skip modifying exception mask and flags in SET_RESTORE_ROUND_53BIT
+    
+    We only need to set/restore rounding mode to ensure correct
+    computation for non-default rounding modes.
+
+diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
+index 9d6ecad..e98360d 100644
+--- a/sysdeps/generic/math_private.h
++++ b/sysdeps/generic/math_private.h
+@@ -446,8 +446,8 @@ default_libc_feholdexcept_setround (fenv_t *e, int r)
+ # define libc_feholdexcept_setroundl default_libc_feholdexcept_setround
+ #endif
+ 
+-#ifndef libc_feholdexcept_setround_53bit
+-# define libc_feholdexcept_setround_53bit libc_feholdexcept_setround
++#ifndef libc_feholdsetround_53bit
++# define libc_feholdsetround_53bit libc_feholdsetround
+ #endif
+ 
+ #ifndef libc_fetestexcept
+@@ -492,8 +492,8 @@ default_libc_feupdateenv (fenv_t *e)
+ # define libc_feupdateenvl default_libc_feupdateenv
+ #endif
+ 
+-#ifndef libc_feupdateenv_53bit
+-# define libc_feupdateenv_53bit libc_feupdateenv
++#ifndef libc_feresetround_53bit
++# define libc_feresetround_53bit libc_feresetround
+ #endif
+ 
+ static __always_inline int
+@@ -580,8 +580,8 @@ default_libc_feupdateenv_test (fenv_t *e, int ex)
+ 
+ /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits.  */
+ #define SET_RESTORE_ROUND_53BIT(RM) \
+-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feupdateenv_53bit))); \
+-  libc_feholdexcept_setround_53bit (&__libc_save_rm, (RM))
++  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_53bit))); \
++  libc_feholdsetround_53bit (&__libc_save_rm, (RM))
+ 
+ #define __nan(str) \
+   (__builtin_constant_p (str) && str[0] == '\0' ? NAN : __nan (str))
diff --git a/glibc.spec b/glibc.spec
index e13780c..ab52a64 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -28,7 +28,7 @@
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 32%{?dist}
+Release: 33%{?dist}
 # GPLv2+ is used in a bunch of programs, LGPLv2+ is used for libraries.
 # Things that are linked directly into dynamically linked programs
 # and shared libraries (e.g. crt files, lib*_nonshared.a) have an additional
@@ -128,6 +128,8 @@ Patch1050: %{name}-rh811753.patch
 Patch1051: %{name}-rh811753-2.patch
 Patch1052: %{name}-rh890035.patch
 Patch1053: %{name}-rh905877.patch
+Patch1054: %{name}-rh977887.patch
+Patch1055: %{name}-rh977887-2.patch
 
 #
 # Patches submitted, but not yet approved upstream.
@@ -485,6 +487,8 @@ rm -rf %{glibcportsdir}
 %patch1052 -p1
 %patch1053 -p1
 %patch0038 -p1
+%patch1054 -p1
+%patch1055 -p1
 
 # On powerpc32, hp timing is only available in power4/power6
 # libs, not in base, so pre-power4 dynamic linker is incompatible
@@ -1292,6 +1296,9 @@ rm -f *.filelist*
 %endif
 
 %changelog
+* Tue Jun 25 2013 Siddhesh Poyarekar <siddhesh@redhat.com> - 2.16-33
+  - Fix libm performance regression due to set/restore rounding mode (#977887).
+
 * Sun May  5 2013 Patsy Franklin <pfrankli@redhat.com> - 2.16-32
   - Fix _nl_find_msg malloc failure case, and callers. (#959034).