glibc/glibc-collation-cldr-12.patch

138 lines
6.8 KiB
Diff
Raw Blame History

commit cc5351f2c0502826f8b4143f3646d44e334ff7b8
Author: Mike FABIAN <mfabian@redhat.com>
Date: Tue Jan 23 17:29:36 2018 +0100
Fix test cases tst-fnmatch and tst-regexloc for the new iso14651_t1_common file.
See:
http://pubs.opengroup.org/onlinepubs/7908799/xbd/re.html
> A range expression represents the set of collating elements that fall
> between two elements in the current collation sequence,
> inclusively. It is expressed as the starting point and the ending
> point separated by a hyphen (-).
>
> Range expressions must not be used in portable applications because
> their behaviour is dependent on the collating sequence. Ranges will be
> treated according to the current collating sequence, and include such
> characters that fall within the range based on that collating
> sequence, regardless of character values. This, however, means that
> the interpretation will differ depending on collating sequence. If,
> for instance, one collating sequence defines ä as a variant of a,
> while another defines it as a letter following z, then the expression
> [ä-z] is valid in the first language and invalid in the second.
Therefore, using [a-z] does not make much sense except in the C/POSIX locale.
The new iso14651_t1_common lists upper case and lower case Latin characters
in a different order than the old one which causes surprising results
for example in the de_DE locale: [a-z] now includes A because A comes
after a in iso14651_t1_common but does not include Z because that comes
after z in iso14651_t1_common.
* posix/tst-fnmatch.input: Fix results for range expressions
for non C locales.
* posix/tst-regexloc.c: Do not use a range expression for
de_DE.ISO-8859-1 locale.
diff --git a/posix/tst-fnmatch.input b/posix/tst-fnmatch.input
index 88b3f739a59333d5..589fb2a94038dbe3 100644
--- a/posix/tst-fnmatch.input
+++ b/posix/tst-fnmatch.input
@@ -418,21 +418,47 @@ C "-" "[Z-\\]]" NOMATCH
# Following are tests outside the scope of IEEE 2003.2 since they are using
# locales other than the C locale. The main focus of the tests is on the
# handling of ranges and the recognition of character (vs bytes).
+#
+# See:
+#
+# http://pubs.opengroup.org/onlinepubs/7908799/xbd/re.html
+#
+# > A range expression represents the set of collating elements that fall
+# > between two elements in the current collation sequence,
+# > inclusively. It is expressed as the starting point and the ending
+# > point separated by a hyphen (-).
+# >
+# > Range expressions must not be used in portable applications because
+# > their behaviour is dependent on the collating sequence. Ranges will be
+# > treated according to the current collating sequence, and include such
+# > characters that fall within the range based on that collating
+# > sequence, regardless of character values. This, however, means that
+# > the interpretation will differ depending on collating sequence. If,
+# > for instance, one collating sequence defines ä as a variant of a,
+# > while another defines it as a letter following z, then the expression
+# > [ä-z] is valid in the first language and invalid in the second.
+#
+# Therefore, using [a-z] does not make much sense except in the C/POSIX locale.
+# The new iso14651_t1_common lists upper case and lower case Latin characters
+# in a different order than the old one which causes surprising results
+# for example in the de_DE locale: [a-z] now includes A because A comes
+# after a in iso14651_t1_common but does not include Z because that comes
+# after z in iso14651_t1_common.
de_DE.ISO-8859-1 "a" "[a-z]" 0
de_DE.ISO-8859-1 "z" "[a-z]" 0
de_DE.ISO-8859-1 "ä" "[a-z]" 0
de_DE.ISO-8859-1 "ö" "[a-z]" 0
de_DE.ISO-8859-1 "ü" "[a-z]" 0
-de_DE.ISO-8859-1 "A" "[a-z]" NOMATCH
+de_DE.ISO-8859-1 "A" "[a-z]" 0 # surprising but correct!
de_DE.ISO-8859-1 "Z" "[a-z]" NOMATCH
-de_DE.ISO-8859-1 "Ä" "[a-z]" NOMATCH
-de_DE.ISO-8859-1 "Ö" "[a-z]" NOMATCH
-de_DE.ISO-8859-1 "Ü" "[a-z]" NOMATCH
+de_DE.ISO-8859-1 "Ä" "[a-z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "Ö" "[a-z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "Ü" "[a-z]" 0 # surprising but correct!
de_DE.ISO-8859-1 "a" "[A-Z]" NOMATCH
-de_DE.ISO-8859-1 "z" "[A-Z]" NOMATCH
-de_DE.ISO-8859-1 "ä" "[A-Z]" NOMATCH
-de_DE.ISO-8859-1 "ö" "[A-Z]" NOMATCH
-de_DE.ISO-8859-1 "ü" "[A-Z]" NOMATCH
+de_DE.ISO-8859-1 "z" "[A-Z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "ä" "[A-Z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "ö" "[A-Z]" 0 # surprising but correct!
+de_DE.ISO-8859-1 "ü" "[A-Z]" 0 # surprising but correct!
de_DE.ISO-8859-1 "A" "[A-Z]" 0
de_DE.ISO-8859-1 "Z" "[A-Z]" 0
de_DE.ISO-8859-1 "Ä" "[A-Z]" 0
@@ -515,16 +541,16 @@ de_DE.UTF-8 "z" "[a-z]" 0
de_DE.UTF-8 "ä" "[a-z]" 0
de_DE.UTF-8 "ö" "[a-z]" 0
de_DE.UTF-8 "ü" "[a-z]" 0
-de_DE.UTF-8 "A" "[a-z]" NOMATCH
+de_DE.UTF-8 "A" "[a-z]" 0 # surprising but correct!
de_DE.UTF-8 "Z" "[a-z]" NOMATCH
-de_DE.UTF-8 "Ä" "[a-z]" NOMATCH
-de_DE.UTF-8 "Ö" "[a-z]" NOMATCH
-de_DE.UTF-8 "Ü" "[a-z]" NOMATCH
+de_DE.UTF-8 "Ä" "[a-z]" 0 # surprising but correct!
+de_DE.UTF-8 "Ö" "[a-z]" 0 # surprising but correct!
+de_DE.UTF-8 "Ü" "[a-z]" 0 # surprising but correct!
de_DE.UTF-8 "a" "[A-Z]" NOMATCH
-de_DE.UTF-8 "z" "[A-Z]" NOMATCH
-de_DE.UTF-8 "ä" "[A-Z]" NOMATCH
-de_DE.UTF-8 "ö" "[A-Z]" NOMATCH
-de_DE.UTF-8 "ü" "[A-Z]" NOMATCH
+de_DE.UTF-8 "z" "[A-Z]" 0 # surprising but correct!
+de_DE.UTF-8 "ä" "[A-Z]" 0 # surprising but correct!
+de_DE.UTF-8 "ö" "[A-Z]" 0 # surprising but correct!
+de_DE.UTF-8 "ü" "[A-Z]" 0 # surprising but correct!
de_DE.UTF-8 "A" "[A-Z]" 0
de_DE.UTF-8 "Z" "[A-Z]" 0
de_DE.UTF-8 "Ä" "[A-Z]" 0
diff --git a/posix/tst-regexloc.c b/posix/tst-regexloc.c
index 60235b4d3b3e396e..7fbc496d0ce5117f 100644
--- a/posix/tst-regexloc.c
+++ b/posix/tst-regexloc.c
@@ -29,8 +29,8 @@ do_test (void)
if (setlocale (LC_ALL, "de_DE.ISO-8859-1") == NULL)
puts ("cannot set locale");
- else if (regcomp (&re, "[a-f]*", 0) != REG_NOERROR)
- puts ("cannot compile expression \"[a-f]*\"");
+ else if (regcomp (&re, "[abcdef]*", 0) != REG_NOERROR)
+ puts ("cannot compile expression \"[abcdef]*\"");
else if (regexec (&re, "abcdefCDEF", 1, mat, 0) == REG_NOMATCH)
puts ("no match");
else