From 4247b5b452298de8126a97693ea0c13f34f82970 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Tue, 25 Jun 2013 20:53:11 +0530 Subject: [PATCH] Fix libm performance regression due to set/restore rounding mode (#977887) --- glibc-rh977887-2.patch | 337 +++++++++++++++++++++++++++++++++++++++++ glibc-rh977887.patch | 46 ++++++ glibc.spec | 9 +- 3 files changed, 391 insertions(+), 1 deletion(-) create mode 100644 glibc-rh977887-2.patch create mode 100644 glibc-rh977887.patch diff --git a/glibc-rh977887-2.patch b/glibc-rh977887-2.patch new file mode 100644 index 0000000..05648bc --- /dev/null +++ b/glibc-rh977887-2.patch @@ -0,0 +1,337 @@ +commit 2506109403de69bd454de27835d42e6eb6ec3abc +Author: Siddhesh Poyarekar +Date: Wed Jun 12 10:36:48 2013 +0530 + + Set/restore rounding mode only when needed + + The most common use case of math functions is with default rounding + mode, i.e. rounding to nearest. Setting and restoring rounding mode + is an unnecessary overhead for this, so I've added support for a + context, which does the set/restore only if the FP status needs a + change. The code is written such that only x86 uses these. Other + architectures should be unaffected by it, but would definitely benefit + if the set/restore has as much overhead relative to the rest of the + code, as the x86 bits do. + + Here's a summary of the performance improvement due to these + improvements; I've only mentioned functions that use the set/restore + and have benchmark inputs for x86_64: + + Before: + + cos(): ITERS:4.69335e+08: TOTAL:28884.6Mcy, MAX:4080.28cy, MIN:57.562cy, 16248.6 calls/Mcy + exp(): ITERS:4.47604e+08: TOTAL:28796.2Mcy, MAX:207.721cy, MIN:62.385cy, 15543.9 calls/Mcy + pow(): ITERS:1.63485e+08: TOTAL:28879.9Mcy, MAX:362.255cy, MIN:172.469cy, 5660.86 calls/Mcy + sin(): ITERS:3.89578e+08: TOTAL:28900Mcy, MAX:704.859cy, MIN:47.583cy, 13480.2 calls/Mcy + tan(): ITERS:7.0971e+07: TOTAL:28902.2Mcy, MAX:1357.79cy, MIN:388.58cy, 2455.55 calls/Mcy + + After: + + cos(): ITERS:6.0014e+08: TOTAL:28875.9Mcy, MAX:364.283cy, MIN:45.716cy, 20783.4 calls/Mcy + exp(): ITERS:5.48578e+08: TOTAL:28764.9Mcy, MAX:191.617cy, MIN:51.011cy, 19071.1 calls/Mcy + pow(): ITERS:1.70013e+08: TOTAL:28873.6Mcy, MAX:689.522cy, MIN:163.989cy, 5888.18 calls/Mcy + sin(): ITERS:4.64079e+08: TOTAL:28891.5Mcy, MAX:6959.3cy, MIN:36.189cy, 16062.8 calls/Mcy + tan(): ITERS:7.2354e+07: TOTAL:28898.9Mcy, MAX:1295.57cy, MIN:380.698cy, 2503.7 calls/Mcy + + So the improvements are: + + cos: 27.9089% + exp: 22.6919% + pow: 4.01564% + sin: 19.1585% + tan: 1.96086% + + The downside of the change is that it will have an adverse performance + impact on non-default rounding modes, but I think the tradeoff is + justified. + +diff --git a/include/fenv.h b/include/fenv.h +index ed6d139..9f90d17 100644 +--- a/include/fenv.h ++++ b/include/fenv.h +@@ -1,5 +1,6 @@ + #ifndef _FENV_H + #include ++#include + + /* Now define the internal interfaces. */ + +@@ -23,4 +24,13 @@ libm_hidden_proto (fetestexcept) + libm_hidden_proto (feupdateenv) + libm_hidden_proto (fetestexcept) + ++/* Rounding mode context. This allows functions to set/restore rounding mode ++ only when the desired rounding mode is different from the current rounding ++ mode. */ ++struct rm_ctx ++{ ++ fenv_t env; ++ bool updated_status; ++}; ++ + #endif +diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h +index e98360d..c0fc03d 100644 +--- a/sysdeps/generic/math_private.h ++++ b/sysdeps/generic/math_private.h +@@ -553,35 +553,62 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) + # define libc_feresetround_noexl libc_fesetenvl + #endif + ++#if HAVE_RM_CTX ++/* Set/Restore Rounding Modes only when necessary. If defined, these functions ++ set/restore floating point state only if the state needed within the lexical ++ block is different from the current state. This saves a lot of time when ++ the floating point unit is much slower than the fixed point units. */ ++ ++# ifndef libc_feresetround_noex_ctx ++# define libc_feresetround_noex_ctx libc_fesetenv_ctx ++# endif ++# ifndef libc_feresetround_noexf_ctx ++# define libc_feresetround_noexf_ctx libc_fesetenvf_ctx ++# endif ++# ifndef libc_feresetround_noexl_ctx ++# define libc_feresetround_noexl_ctx libc_fesetenvl_ctx ++# endif ++ ++# ifndef libc_feholdsetround_53bit_ctx ++# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx ++# endif ++ ++# ifndef libc_feresetround_53bit_ctx ++# define libc_feresetround_53bit_ctx libc_feresetround_ctx ++# endif ++ ++# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ ++ struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx))); \ ++ ROUNDFUNC ## _ctx (&ctx, (RM)) ++#else ++# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \ ++ fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC))); \ ++ ROUNDFUNC (&__libc_save_rm, (RM)) ++#endif ++ + /* Save and restore the rounding mode within a lexical block. */ + + #define SET_RESTORE_ROUND(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround))); \ +- libc_feholdsetround (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround) + #define SET_RESTORE_ROUNDF(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetroundf))); \ +- libc_feholdsetroundf (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetroundf) + #define SET_RESTORE_ROUNDL(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetroundl))); \ +- libc_feholdsetroundl (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl) + + /* Save and restore the rounding mode within a lexical block, and also + the set of exceptions raised within the block may be discarded. */ + + #define SET_RESTORE_ROUND_NOEX(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noex))); \ +- libc_feholdsetround (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex) + #define SET_RESTORE_ROUND_NOEXF(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noexf))); \ +- libc_feholdsetroundf (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf) + #define SET_RESTORE_ROUND_NOEXL(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noexl))); \ +- libc_feholdsetroundl (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl) + + /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits. */ + #define SET_RESTORE_ROUND_53BIT(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_53bit))); \ +- libc_feholdsetround_53bit (&__libc_save_rm, (RM)) ++ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_53bit, \ ++ libc_feresetround_53bit) + + #define __nan(str) \ + (__builtin_constant_p (str) && str[0] == '\0' ? NAN : __nan (str)) +diff --git a/sysdeps/i386/fpu/fenv_private.h b/sysdeps/i386/fpu/fenv_private.h +index 1f8336c..3998387 100644 +--- a/sysdeps/i386/fpu/fenv_private.h ++++ b/sysdeps/i386/fpu/fenv_private.h +@@ -322,6 +322,179 @@ libc_feresetround_387 (fenv_t *e) + # define libc_feholdsetround_53bit libc_feholdsetround_387_53bit + #endif + ++/* We have support for rounding mode context. */ ++#define HAVE_RM_CTX 1 ++ ++static __always_inline void ++libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r) ++{ ++ unsigned int mxcsr, new_mxcsr; ++ asm (STMXCSR " %0" : "=m" (*&mxcsr)); ++ new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3); ++ ++ ctx->env.__mxcsr = mxcsr; ++ if (__glibc_unlikely (mxcsr != new_mxcsr)) ++ { ++ asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr)); ++ ctx->updated_status = true; ++ } ++ else ++ ctx->updated_status = false; ++} ++ ++/* Unconditional since we want to overwrite any exceptions that occurred in the ++ context. This is also why all fehold* functions unconditionally write into ++ ctx->env. */ ++static __always_inline void ++libc_fesetenv_sse_ctx (struct rm_ctx *ctx) ++{ ++ libc_fesetenv_sse (&ctx->env); ++} ++ ++static __always_inline void ++libc_feupdateenv_sse_ctx (struct rm_ctx *ctx) ++{ ++ if (__glibc_unlikely (ctx->updated_status)) ++ libc_feupdateenv_test_sse (&ctx->env, 0); ++} ++ ++static __always_inline void ++libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r) ++{ ++ libc_feholdexcept_387 (&ctx->env); ++ ++ fpu_control_t cw = ctx->env.__control_word; ++ fpu_control_t old_cw = cw; ++ cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED); ++ cw |= r | 0x3f; ++ ++ if (__glibc_unlikely (old_cw != cw)) ++ { ++ _FPU_SETCW (cw); ++ ctx->updated_status = true; ++ } ++ else ++ ctx->updated_status = false; ++} ++ ++static __always_inline void ++libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r) ++{ ++ libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_EXTENDED); ++} ++ ++static __always_inline void ++libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r) ++{ ++ libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_DOUBLE); ++} ++ ++static __always_inline void ++libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r) ++{ ++ fpu_control_t cw, new_cw; ++ ++ _FPU_GETCW (cw); ++ new_cw = cw; ++ new_cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED); ++ new_cw |= r; ++ ++ ctx->env.__control_word = cw; ++ if (__glibc_unlikely (new_cw != cw)) ++ { ++ _FPU_SETCW (new_cw); ++ ctx->updated_status = true; ++ } ++ else ++ ctx->updated_status = false; ++} ++ ++static __always_inline void ++libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r) ++{ ++ libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_EXTENDED); ++} ++ ++static __always_inline void ++libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r) ++{ ++ libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_DOUBLE); ++} ++ ++static __always_inline void ++libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r) ++{ ++ unsigned int mxcsr, new_mxcsr; ++ ++ asm (STMXCSR " %0" : "=m" (*&mxcsr)); ++ new_mxcsr = (mxcsr & ~0x6000) | (r << 3); ++ ++ ctx->env.__mxcsr = mxcsr; ++ if (__glibc_unlikely (new_mxcsr != mxcsr)) ++ { ++ asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr)); ++ ctx->updated_status = true; ++ } ++ else ++ ctx->updated_status = false; ++} ++ ++static __always_inline void ++libc_feresetround_sse_ctx (struct rm_ctx *ctx) ++{ ++ if (__glibc_unlikely (ctx->updated_status)) ++ libc_feresetround_sse (&ctx->env); ++} ++ ++static __always_inline void ++libc_feresetround_387_ctx (struct rm_ctx *ctx) ++{ ++ if (__glibc_unlikely (ctx->updated_status)) ++ _FPU_SETCW (ctx->env.__control_word); ++} ++ ++static __always_inline void ++libc_feupdateenv_387_ctx (struct rm_ctx *ctx) ++{ ++ if (__glibc_unlikely (ctx->updated_status)) ++ libc_feupdateenv_test_387 (&ctx->env, 0); ++} ++ ++#ifdef __SSE_MATH__ ++# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx ++# define libc_fesetenvf_ctx libc_fesetenv_sse_ctx ++# define libc_feupdateenvf_ctx libc_feupdateenv_sse_ctx ++# define libc_feholdsetroundf_ctx libc_feholdsetround_sse_ctx ++# define libc_feresetroundf_ctx libc_feresetround_sse_ctx ++#else ++# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx ++# define libc_feupdateenvf_ctx libc_feupdateenv_387_ctx ++# define libc_feholdsetroundf_ctx libc_feholdsetround_387_ctx ++# define libc_feresetroundf_ctx libc_feresetround_387_ctx ++#endif /* __SSE_MATH__ */ ++ ++#ifdef __SSE2_MATH__ ++# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_sse_ctx ++# define libc_fesetenv_ctx libc_fesetenv_sse_ctx ++# define libc_feupdateenv_ctx libc_feupdateenv_sse_ctx ++# define libc_feholdsetround_ctx libc_feholdsetround_sse_ctx ++# define libc_feresetround_ctx libc_feresetround_sse_ctx ++#else ++# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_387_ctx ++# define libc_feupdateenv_ctx libc_feupdateenv_387_ctx ++# define libc_feresetround_ctx libc_feresetround_387_ctx ++#endif /* __SSE2_MATH__ */ ++ ++#define libc_feholdexcept_setroundl_ctx libc_feholdexcept_setround_387_ctx ++#define libc_feupdateenvl_ctx libc_feupdateenv_387_ctx ++#define libc_feholdsetroundl_ctx libc_feholdsetround_387_ctx ++#define libc_feresetroundl_ctx libc_feresetround_387_ctx ++ ++#ifndef __SSE2_MATH__ ++# define libc_feholdsetround_53bit_ctx libc_feholdsetround_387_53bit_ctx ++# define libc_feresetround_53bit_ctx libc_feresetround_387_ctx ++#endif ++ + #undef __mxcsr + + #endif /* FENV_PRIVATE_H */ diff --git a/glibc-rh977887.patch b/glibc-rh977887.patch new file mode 100644 index 0000000..69e990d --- /dev/null +++ b/glibc-rh977887.patch @@ -0,0 +1,46 @@ +commit 4c60cb0c8329dd498e9cce3735e5ee6212ad28f4 +Author: Siddhesh Poyarekar +Date: Wed Jun 5 13:56:19 2013 +0530 + + Skip modifying exception mask and flags in SET_RESTORE_ROUND_53BIT + + We only need to set/restore rounding mode to ensure correct + computation for non-default rounding modes. + +diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h +index 9d6ecad..e98360d 100644 +--- a/sysdeps/generic/math_private.h ++++ b/sysdeps/generic/math_private.h +@@ -446,8 +446,8 @@ default_libc_feholdexcept_setround (fenv_t *e, int r) + # define libc_feholdexcept_setroundl default_libc_feholdexcept_setround + #endif + +-#ifndef libc_feholdexcept_setround_53bit +-# define libc_feholdexcept_setround_53bit libc_feholdexcept_setround ++#ifndef libc_feholdsetround_53bit ++# define libc_feholdsetround_53bit libc_feholdsetround + #endif + + #ifndef libc_fetestexcept +@@ -492,8 +492,8 @@ default_libc_feupdateenv (fenv_t *e) + # define libc_feupdateenvl default_libc_feupdateenv + #endif + +-#ifndef libc_feupdateenv_53bit +-# define libc_feupdateenv_53bit libc_feupdateenv ++#ifndef libc_feresetround_53bit ++# define libc_feresetround_53bit libc_feresetround + #endif + + static __always_inline int +@@ -580,8 +580,8 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) + + /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits. */ + #define SET_RESTORE_ROUND_53BIT(RM) \ +- fenv_t __libc_save_rm __attribute__((cleanup(libc_feupdateenv_53bit))); \ +- libc_feholdexcept_setround_53bit (&__libc_save_rm, (RM)) ++ fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_53bit))); \ ++ libc_feholdsetround_53bit (&__libc_save_rm, (RM)) + + #define __nan(str) \ + (__builtin_constant_p (str) && str[0] == '\0' ? NAN : __nan (str)) diff --git a/glibc.spec b/glibc.spec index e13780c..ab52a64 100644 --- a/glibc.spec +++ b/glibc.spec @@ -28,7 +28,7 @@ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: 32%{?dist} +Release: 33%{?dist} # GPLv2+ is used in a bunch of programs, LGPLv2+ is used for libraries. # Things that are linked directly into dynamically linked programs # and shared libraries (e.g. crt files, lib*_nonshared.a) have an additional @@ -128,6 +128,8 @@ Patch1050: %{name}-rh811753.patch Patch1051: %{name}-rh811753-2.patch Patch1052: %{name}-rh890035.patch Patch1053: %{name}-rh905877.patch +Patch1054: %{name}-rh977887.patch +Patch1055: %{name}-rh977887-2.patch # # Patches submitted, but not yet approved upstream. @@ -485,6 +487,8 @@ rm -rf %{glibcportsdir} %patch1052 -p1 %patch1053 -p1 %patch0038 -p1 +%patch1054 -p1 +%patch1055 -p1 # On powerpc32, hp timing is only available in power4/power6 # libs, not in base, so pre-power4 dynamic linker is incompatible @@ -1292,6 +1296,9 @@ rm -f *.filelist* %endif %changelog +* Tue Jun 25 2013 Siddhesh Poyarekar - 2.16-33 + - Fix libm performance regression due to set/restore rounding mode (#977887). + * Sun May 5 2013 Patsy Franklin - 2.16-32 - Fix _nl_find_msg malloc failure case, and callers. (#959034).