python3.12/00262-pep538_coerce_legacy_...

576 lines
20 KiB
Diff

diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst
index c0e64d6..0bb28da 100644
--- a/Doc/using/cmdline.rst
+++ b/Doc/using/cmdline.rst
@@ -711,6 +711,35 @@ conflict.
.. versionadded:: 3.6
+
+.. envvar:: PYTHONCOERCECLOCALE
+
+ If set to the value ``0``, causes the main Python command line application
+ to skip coercing the legacy ASCII-based C locale to a more capable UTF-8
+ based alternative. Note that this setting is checked even when the
+ :option:`-E` or :option:`-I` options are used, as it is handled prior to
+ the processing of command line options.
+
+ If this variable is *not* set, and the current locale reported for the
+ ``LC_CTYPE`` category is the default ``C`` locale, then the Python CLI will
+ attempt to configure one of the following locales for the given locale
+ categories before loading the interpreter runtime:
+
+ * ``C.UTF-8`` (``LC_ALL``)
+ * ``C.utf8`` (``LC_ALL``)
+ * ``UTF-8`` (``LC_CTYPE``)
+
+ If setting one of these locale categories succeeds, then the matching
+ environment variables will be set (both ``LC_ALL`` and ``LANG`` for the
+ ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category),
+ and (if not already set to a non-empty string) :envvar:`PYTHONIOENCODING`
+ will be set to ``utf-8:surrogateescape``.
+
+ Availability: \*nix
+
+ .. versionadded:: 3.7
+ See :pep:`538` for more details.
+
Debug-mode variables
~~~~~~~~~~~~~~~~~~~~
diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py
index 80889b1..1a1a862 100644
--- a/Lib/test/support/script_helper.py
+++ b/Lib/test/support/script_helper.py
@@ -51,8 +51,35 @@ def interpreter_requires_environment():
return __cached_interp_requires_environment
-_PythonRunResult = collections.namedtuple("_PythonRunResult",
- ("rc", "out", "err"))
+class _PythonRunResult(collections.namedtuple("_PythonRunResult",
+ ("rc", "out", "err"))):
+ """Helper for reporting Python subprocess run results"""
+ def fail(self, cmd_line):
+ """Provide helpful details about failed subcommand runs"""
+ # Limit to 80 lines to ASCII characters
+ maxlen = 80 * 100
+ out, err = self.out, self.err
+ if len(out) > maxlen:
+ out = b'(... truncated stdout ...)' + out[-maxlen:]
+ if len(err) > maxlen:
+ err = b'(... truncated stderr ...)' + err[-maxlen:]
+ out = out.decode('ascii', 'replace').rstrip()
+ err = err.decode('ascii', 'replace').rstrip()
+ raise AssertionError("Process return code is %d\n"
+ "command line: %r\n"
+ "\n"
+ "stdout:\n"
+ "---\n"
+ "%s\n"
+ "---\n"
+ "\n"
+ "stderr:\n"
+ "---\n"
+ "%s\n"
+ "---"
+ % (self.rc, cmd_line,
+ out,
+ err))
# Executing the interpreter in a subprocess
@@ -99,30 +126,7 @@ def run_python_until_end(*args, **env_vars):
def _assert_python(expected_success, *args, **env_vars):
res, cmd_line = run_python_until_end(*args, **env_vars)
if (res.rc and expected_success) or (not res.rc and not expected_success):
- # Limit to 80 lines to ASCII characters
- maxlen = 80 * 100
- out, err = res.out, res.err
- if len(out) > maxlen:
- out = b'(... truncated stdout ...)' + out[-maxlen:]
- if len(err) > maxlen:
- err = b'(... truncated stderr ...)' + err[-maxlen:]
- out = out.decode('ascii', 'replace').rstrip()
- err = err.decode('ascii', 'replace').rstrip()
- raise AssertionError("Process return code is %d\n"
- "command line: %r\n"
- "\n"
- "stdout:\n"
- "---\n"
- "%s\n"
- "---\n"
- "\n"
- "stderr:\n"
- "---\n"
- "%s\n"
- "---"
- % (res.rc, cmd_line,
- out,
- err))
+ res.fail(cmd_line)
return res
def assert_python_ok(*args, **env_vars):
diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py
index 2a53f3d..ece84af 100644
--- a/Lib/test/test_capi.py
+++ b/Lib/test/test_capi.py
@@ -386,7 +386,7 @@ class EmbeddingTests(unittest.TestCase):
def test_subinterps(self):
# This is just a "don't crash" test
out, err = self.run_embedded_interpreter("repeated_init_and_subinterpreters")
- if support.verbose:
+ if support.verbose > 1:
print()
print(out)
print(err)
@@ -404,14 +404,15 @@ class EmbeddingTests(unittest.TestCase):
def test_forced_io_encoding(self):
# Checks forced configuration of embedded interpreter IO streams
out, err = self.run_embedded_interpreter("forced_io_encoding")
- if support.verbose:
+ if support.verbose > 1:
print()
print(out)
print(err)
- expected_errors = sys.__stdout__.errors
- expected_stdin_encoding = sys.__stdin__.encoding
+ expected_errors = "surrogateescape"
+ expected_stdin_encoding = "UTF-8"
expected_pipe_encoding = self._get_default_pipe_encoding()
expected_output = '\n'.join([
+ "Setting PYTHONIOENCODING=UTF-8:surrogateescape",
"--- Use defaults ---",
"Expected encoding: default",
"Expected errors: default",
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index b71bb9f..56867fc 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -9,8 +9,9 @@ import sys
import subprocess
import tempfile
from test.support import script_helper, is_android
-from test.support.script_helper import (spawn_python, kill_python, assert_python_ok,
- assert_python_failure)
+from test.support.script_helper import (
+ spawn_python, kill_python, assert_python_ok, assert_python_failure
+)
# XXX (ncoghlan): Move to script_helper and make consistent with run_python
@@ -151,6 +152,7 @@ class CmdLineTest(unittest.TestCase):
env = os.environ.copy()
# Use C locale to get ascii for the locale encoding
env['LC_ALL'] = 'C'
+ env['PYTHONCOERCECLOCALE'] = '0'
code = (
b'import locale; '
b'print(ascii("' + undecodable + b'"), '
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index df9ebd4..63145e4 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -680,6 +680,7 @@ class SysModuleTest(unittest.TestCase):
# Force the POSIX locale
env = os.environ.copy()
env["LC_ALL"] = "C"
+ env["PYTHONCOERCECLOCALE"] = "0"
code = '\n'.join((
'import sys',
'def dump(name):',
diff --git a/Programs/_testembed.c b/Programs/_testembed.c
index a68d4fa..1494452 100644
--- a/Programs/_testembed.c
+++ b/Programs/_testembed.c
@@ -1,4 +1,5 @@
-#include <Python.h>
+#include "Python.h"
+#include "pyconfig.h"
#include <stdio.h>
/*********************************************************
@@ -106,6 +107,9 @@ static void check_stdio_details(const char *encoding, const char * errors)
static int test_forced_io_encoding(void)
{
+ /* Ensure consistent "defaults" */
+ printf("Setting PYTHONIOENCODING=UTF-8:surrogateescape\n");
+ setenv("PYTHONIOENCODING", "UTF-8:surrogateescape", 1);
/* Check various combinations */
printf("--- Use defaults ---\n");
check_stdio_details(NULL, NULL);
@@ -126,6 +130,20 @@ static int test_forced_io_encoding(void)
return 0;
}
+static int test_c_locale_warning(void)
+{
+#ifdef PY_WARN_ON_C_LOCALE
+ /* Force use of the C locale */
+ setenv("LC_ALL", "C", 1);
+
+ _testembed_Py_Initialize();
+ Py_Finalize();
+#else
+ printf("C locale compatibility warning disabled at compile time\n");
+#endif
+ return 0;
+}
+
/* *********************************************************
* List of test cases and the function that implements it.
*
@@ -147,6 +165,7 @@ struct TestCase
static struct TestCase TestCases[] = {
{ "forced_io_encoding", test_forced_io_encoding },
{ "repeated_init_and_subinterpreters", test_repeated_init_and_subinterpreters },
+ { "c_locale_warning", test_c_locale_warning },
{ NULL, NULL }
};
diff --git a/Programs/python.c b/Programs/python.c
index a7afbc7..b5edebb 100644
--- a/Programs/python.c
+++ b/Programs/python.c
@@ -15,6 +15,110 @@ wmain(int argc, wchar_t **argv)
}
#else
+/* Helpers to better handle the legacy C locale
+ *
+ * The legacy C locale assumes ASCII as the default text encoding, which
+ * causes problems not only for the CPython runtime, but also other
+ * components like GNU readline.
+ *
+ * Accordingly, when the CLI detects it, it attempts to coerce it to a
+ * more capable UTF-8 based alternative.
+ *
+ * See the documentation of the PYTHONCOERCECLOCALE setting for more details.
+ *
+ */
+
+#ifdef PY_COERCE_C_LOCALE
+static const char *_C_LOCALE_COERCION_WARNING =
+ "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale "
+ "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour).\n";
+
+typedef struct _CandidateLocale {
+ const char *locale_name;
+ int category;
+} _LocaleCoercionTarget;
+
+static _LocaleCoercionTarget _TARGET_LOCALES[] = {
+ { "C.UTF-8", LC_ALL },
+ { "C.utf8", LC_ALL },
+ { "UTF-8", LC_CTYPE },
+ { NULL, 0 }
+};
+
+void
+_coerce_default_locale_settings(const _LocaleCoercionTarget *target)
+{
+ const char *newloc = target->locale_name;
+ int category = target->category;
+
+ /* Reset locale back to currently configured defaults */
+ setlocale(LC_ALL, "");
+
+ /* Set the relevant locale environment variables */
+ if (category == LC_ALL) {
+ const char *env_vars_updated = "LC_ALL & LANG";
+ if (setenv("LC_ALL", newloc, 1)) {
+ fprintf(stderr,
+ "Error setting LC_ALL, skipping C locale coercion\n");
+ return;
+ }
+ if (setenv("LANG", newloc, 1)) {
+ fprintf(stderr,
+ "Error setting LANG during C locale coercion\n");
+ env_vars_updated = "LC_ALL";
+ }
+ fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc);
+ } else if (category == LC_CTYPE) {
+ if (setenv("LC_CTYPE", newloc, 1)) {
+ fprintf(stderr,
+ "Error setting LC_CTYPE, skipping C locale coercion\n");
+ return;
+ }
+ fprintf(stderr, _C_LOCALE_COERCION_WARNING, "LC_CTYPE", newloc);
+ } else {
+ fprintf(stderr, "Locale coercion must target LC_ALL or LC_CTYPE\n");
+ return;
+ }
+
+ /* Set PYTHONIOENCODING if not already set */
+ if (setenv("PYTHONIOENCODING", "utf-8:surrogateescape", 0)) {
+ fprintf(stderr,
+ "Error setting PYTHONIOENCODING during C locale coercion\n");
+ }
+
+ /* Reconfigure with the overridden environment variables */
+ setlocale(LC_ALL, "");
+}
+
+void
+_handle_legacy_c_locale(void)
+{
+ const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+ /* We ignore the Python -E and -I flags here, as we need to sort out
+ * the locale settings *before* we try to do anything with the command
+ * line arguments. For cross-platform debugging purposes, we also need
+ * to give end users a way to force even scripts that are otherwise
+ * isolated from their environment to use the legacy ASCII-centric C
+ * locale.
+ */
+ if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
+ /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */
+ const _LocaleCoercionTarget *target = NULL;
+ for (target = _TARGET_LOCALES; target->locale_name; target++) {
+ const char *reconfigured_locale = setlocale(target->category,
+ target->locale_name);
+ if (reconfigured_locale != NULL) {
+ /* Successfully configured locale, so make it the default */
+ _coerce_default_locale_settings(target);
+ return;
+ }
+ }
+
+ }
+ /* No C locale warning here, as Py_Initialize will emit one later */
+}
+#endif
+
int
main(int argc, char **argv)
{
@@ -49,7 +153,26 @@ main(int argc, char **argv)
return 1;
}
+#ifdef __ANDROID__
+ /* Passing "" to setlocale() on Android requests the C locale rather
+ * than checking environment variables, so request C.UTF-8 explicitly
+ */
+ setlocale(LC_ALL, "C.UTF-8");
+#else
+ /* Reconfigure the locale to the default for this process */
setlocale(LC_ALL, "");
+#endif
+
+#ifdef PY_COERCE_C_LOCALE
+ /* When the LC_CTYPE category still claims to be using the C locale,
+ assume configuration error and try for a UTF-8 based locale instead */
+ const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+ if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) {
+ _handle_legacy_c_locale();
+ }
+#endif
+
+ /* Convert from char to wchar_t based on the locale settings */
for (i = 0; i < argc; i++) {
argv_copy[i] = Py_DecodeLocale(argv[i], NULL);
if (!argv_copy[i]) {
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index a4f7f82..dd58dc9 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -301,6 +301,31 @@ import_init(PyInterpreterState *interp, PyObject *sysmod)
}
+#ifdef PY_WARN_ON_C_LOCALE
+static const char *_C_LOCALE_WARNING =
+ "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
+ "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
+ "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
+ "locales is recommended.\n";
+
+static void
+_emit_stderr_warning_for_c_locale(void)
+{
+ const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE");
+ /* We don't emit a warning if locale coercion has been explicitly disabled.
+ *
+ * For consistency with the corresponding check in Programs/python.c
+ * we ignore the Python -E and -I flags here.
+ */
+ if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) {
+ const char *ctype_loc = setlocale(LC_CTYPE, NULL);
+ if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) {
+ fprintf(stderr, "%s", _C_LOCALE_WARNING);
+ }
+ }
+}
+#endif
+
void
_Py_InitializeEx_Private(int install_sigs, int install_importlib)
{
@@ -315,11 +340,19 @@ _Py_InitializeEx_Private(int install_sigs, int install_importlib)
initialized = 1;
_Py_Finalizing = NULL;
-#ifdef HAVE_SETLOCALE
+#ifdef __ANDROID__
+ /* Passing "" to setlocale() on Android requests the C locale rather
+ * than checking environment variables, so request C.UTF-8 explicitly
+ */
+ setlocale(LC_CTYPE, "C.UTF-8");
+#else
/* Set up the LC_CTYPE locale, so we can obtain
the locale's charset without having to switch
locales. */
setlocale(LC_CTYPE, "");
+#ifdef PY_WARN_ON_C_LOCALE
+ _emit_stderr_warning_for_c_locale();
+#endif
#endif
if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0')
diff --git a/configure b/configure
index 6bcddb7..13052d6 100755
--- a/configure
+++ b/configure
@@ -834,6 +834,8 @@ with_thread
enable_ipv6
with_doc_strings
with_pymalloc
+with_c_locale_coercion
+with_c_locale_warning
with_valgrind
with_dtrace
with_fpectl
@@ -1527,6 +1529,12 @@ Optional Packages:
deprecated; use --with(out)-threads
--with(out)-doc-strings disable/enable documentation strings
--with(out)-pymalloc disable/enable specialized mallocs
+ --with(out)-c-locale-coercion
+ disable/enable C locale coercion to a UTF-8 based
+ locale
+ --with(out)-c-locale-warning
+ disable/enable locale compatibility warning in the C
+ locale
--with-valgrind Enable Valgrind support
--with(out)-dtrace disable/enable DTrace support
--with-fpectl enable SIGFPE catching
@@ -11016,6 +11024,52 @@ fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_pymalloc" >&5
$as_echo "$with_pymalloc" >&6; }
+# Check for --with-c-locale-coercion
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-coercion" >&5
+$as_echo_n "checking for --with-c-locale-coercion... " >&6; }
+
+# Check whether --with-c-locale-coercion was given.
+if test "${with_c_locale_coercion+set}" = set; then :
+ withval=$with_c_locale_coercion;
+fi
+
+
+if test -z "$with_c_locale_coercion"
+then
+ with_c_locale_coercion="yes"
+fi
+if test "$with_c_locale_coercion" != "no"
+then
+
+$as_echo "#define PY_COERCE_C_LOCALE 1" >>confdefs.h
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_coercion" >&5
+$as_echo "$with_c_locale_coercion" >&6; }
+
+# Check for --with-c-locale-warning
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-warning" >&5
+$as_echo_n "checking for --with-c-locale-warning... " >&6; }
+
+# Check whether --with-c-locale-warning was given.
+if test "${with_c_locale_warning+set}" = set; then :
+ withval=$with_c_locale_warning;
+fi
+
+
+if test -z "$with_c_locale_warning"
+then
+ with_c_locale_warning="yes"
+fi
+if test "$with_c_locale_warning" != "no"
+then
+
+$as_echo "#define PY_WARN_ON_C_LOCALE 1" >>confdefs.h
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_warning" >&5
+$as_echo "$with_c_locale_warning" >&6; }
+
# Check for Valgrind support
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-valgrind" >&5
$as_echo_n "checking for --with-valgrind... " >&6; }
diff --git a/configure.ac b/configure.ac
index e222c21..a1653e7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3287,6 +3287,40 @@ then
fi
AC_MSG_RESULT($with_pymalloc)
+# Check for --with-c-locale-coercion
+AC_MSG_CHECKING(for --with-c-locale-coercion)
+AC_ARG_WITH(c-locale-coercion,
+ AS_HELP_STRING([--with(out)-c-locale-coercion],
+ [disable/enable C locale coercion to a UTF-8 based locale]))
+
+if test -z "$with_c_locale_coercion"
+then
+ with_c_locale_coercion="yes"
+fi
+if test "$with_c_locale_coercion" != "no"
+then
+ AC_DEFINE(PY_COERCE_C_LOCALE, 1,
+ [Define if you want to coerce the C locale to a UTF-8 based locale])
+fi
+AC_MSG_RESULT($with_c_locale_coercion)
+
+# Check for --with-c-locale-warning
+AC_MSG_CHECKING(for --with-c-locale-warning)
+AC_ARG_WITH(c-locale-warning,
+ AS_HELP_STRING([--with(out)-c-locale-warning],
+ [disable/enable locale compatibility warning in the C locale]))
+
+if test -z "$with_c_locale_warning"
+then
+ with_c_locale_warning="yes"
+fi
+if test "$with_c_locale_warning" != "no"
+then
+ AC_DEFINE(PY_WARN_ON_C_LOCALE, 1,
+ [Define to emit a locale compatibility warning in the C locale])
+fi
+AC_MSG_RESULT($with_c_locale_warning)
+
# Check for Valgrind support
AC_MSG_CHECKING([for --with-valgrind])
AC_ARG_WITH([valgrind],
diff --git a/pyconfig.h.in b/pyconfig.h.in
index e7a836c..11e0798 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -1241,9 +1241,15 @@
/* Define as the preferred size in bits of long digits */
#undef PYLONG_BITS_IN_DIGIT
+/* Define if you want to coerce the C locale to a UTF-8 based locale */
+#undef PY_COERCE_C_LOCALE
+
/* Define to printf format modifier for Py_ssize_t */
#undef PY_FORMAT_SIZE_T
+/* Define to emit a locale compatibility warning in the C locale */
+#undef PY_WARN_ON_C_LOCALE
+
/* Define if you want to build an interpreter with many run-time checks. */
#undef Py_DEBUG