From a23ff061c53e7b330bf744e80a575d007d70d75e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Fri, 30 Nov 2018 13:23:57 +0100 Subject: [PATCH] Fix script run matching to allow ASCII digits in scripts that use their own in addition --- ...-PATCH-perl-133547-script-run-broken.patch | 184 ++++++++++++++++++ perl-5.28.1-regexec.c-Rename-variable.patch | 54 +++++ perl.spec | 14 +- 3 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 perl-5.28.1-PATCH-perl-133547-script-run-broken.patch create mode 100644 perl-5.28.1-regexec.c-Rename-variable.patch diff --git a/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch b/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch new file mode 100644 index 0000000..e81e2af --- /dev/null +++ b/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch @@ -0,0 +1,184 @@ +From a824afe95b6272148dce1f8bf4bcd20a667412e6 Mon Sep 17 00:00:00 2001 +From: Karl Williamson +Date: Sun, 30 Sep 2018 10:38:02 -0600 +Subject: [PATCH] PATCH: [perl #133547]: script run broken +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +All scripts can have the ASCII digits for their numbers. Scripts with +their own digits can alternatively use those. Only one of these two +sets can be used in a script run. The decision as to which set to use +must be deferred until the first digit is encountered, as otherwise we +don't know which set will be used. Prior to this commit, the decision +was being made prematurely in some cases. As a result of this change, +the non-ASCII-digits in the Common script need to be special-cased, and +different criteria are used to decide if we need to look up whether a +character is a digit or not. + +Petr Písař: Ported to 5.28.1 from +393e5a4585b92e635cfc4eee34da8f86f3bfd2af. + +Signed-off-by: Petr Písař +--- + regexec.c | 111 +++++++++++++++++++++++----------------------- + t/re/script_run.t | 5 +++ + 2 files changed, 61 insertions(+), 55 deletions(-) + +diff --git a/regexec.c b/regexec.c +index 899d979..201d9aa 100644 +--- a/regexec.c ++++ b/regexec.c +@@ -10323,6 +10323,10 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) + + /* Look at each character in the sequence */ + while (s < send) { ++ /* If the current character being examined is a digit, this is the code ++ * point of the zero for its sequence of 10 */ ++ UV zero_of_char; ++ + UV cp; + + /* The code allows all scripts to use the ASCII digits. This is +@@ -10434,16 +10438,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) + script_of_run = script_of_char; + } + +- /* All decimal digits must be from the same sequence of 10. Above, we +- * handled any ASCII digits without descending to here. We also +- * handled the case where we already knew what digit sequence is the +- * one to use, and the character is in that sequence. Now that we know +- * the script, we can use script_zeros[] to directly find which +- * sequence the script uses, except in a few cases it returns 0 */ +- if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) { +- zero_of_run = script_zeros[script_of_char]; +- } +- + /* Now we can see if the script of the character is the same as that of + * the run */ + if (LIKELY(script_of_char == script_of_run)) { +@@ -10601,55 +10595,62 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) + /* Here, the script of the character is compatible with that of the + * run. That means that in most cases, it continues the script run. + * Either it and the run match exactly, or one or both can be in any of +- * several scripts, and the intersection is not empty. But if the +- * character is a decimal digit, we need further handling. If we +- * haven't seen a digit before, it would establish what set of 10 all +- * must come from; and if we have established a set, we need to check +- * that this is in it. +- * +- * But there are cases we can rule out without having to look up if +- * this is a digit: +- * a. All instances of [0-9] have been dealt with earlier. +- * b. The next digit encoded by Unicode is 1600 code points further +- * on, so if the code point in this loop iteration is less than +- * that, it isn't a digit. +- * c. Most scripts that have digits have a single set of 10. If +- * we've encountered a digit in such a script, 'zero_of_run' is +- * set to the code point (call it z) whose numeric value is 0. +- * If the code point in this loop iteration is in the range +- * z..z+9, it is in the script's set of 10, and we've actually +- * handled it earlier in this function and won't reach this +- * point. But, code points in that script that aren't in that +- * range can't be digits, so we don't have to look any such up. +- * We can tell if this script is such a one by looking at +- * 'script_zeros[]' for it. It is non-zero iff it has a single +- * set of digits. This rule doesn't apply if we haven't narrowed +- * down the possible scripts to a single one yet. Nor if the +- * zero of the run is '0', as that also hasn't narrowed things +- * down completely */ +- if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT +- && ( intersection +- || script_of_char < 0 /* Also implies an intersection */ +- || zero_of_run == '0' +- || script_zeros[script_of_char] == 0)) ++ * several scripts, and the intersection is not empty. However, if the ++ * character is a decimal digit, it could still mean failure if it is ++ * from the wrong sequence of 10. So, we need to look at if it's a ++ * digit. We've already handled the 10 decimal digits, and the next ++ * lowest one is this one: */ ++ if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { ++ continue; /* Not a digit; this character is part of the run */ ++ } ++ ++ /* If we have a definitive '0' for the script of this character, we ++ * know that for this to be a digit, it must be in the range of +0..+9 ++ * of that zero. */ ++ if ( script_of_char >= 0 ++ && (zero_of_char = script_zeros[script_of_char])) + { +- SSize_t zero_of_char_index; +- zero_of_char_index = _invlist_search(decimals_invlist, cp); +- if ( LIKELY(zero_of_char_index >= 0) +- && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index)) ++ if ( cp < zero_of_char ++ || cp > zero_of_char + 9) + { +- UV zero_of_char = decimals_array[zero_of_char_index]; +- if (zero_of_run) { +- if (zero_of_run != zero_of_char) { +- retval = FALSE; +- break; +- } +- } +- else { +- zero_of_run = zero_of_char; +- } ++ continue; /* Not a digit; this character is part of the run ++ */ ++ } ++ ++ } ++ else { /* Need to look up if this character is a digit or not */ ++ SSize_t index_of_zero_of_char; ++ index_of_zero_of_char = _invlist_search(decimals_invlist, cp); ++ if ( UNLIKELY(index_of_zero_of_char < 0) ++ || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char)) ++ { ++ continue; /* Not a digit; this character is part of the run. ++ */ ++ } ++ ++ zero_of_char = decimals_array[index_of_zero_of_char]; ++ } ++ ++ /* Here, the character is a decimal digit, and the zero of its sequence ++ * of 10 is in 'zero_of_char'. If we already have a zero for this run, ++ * they better be the same. */ ++ if (zero_of_run) { ++ if (zero_of_run != zero_of_char) { ++ retval = FALSE; ++ break; + } + } ++ else if (script_of_char == SCX_Common && script_of_run != SCX_Common) { ++ ++ /* Here, the script run isn't Common, but the current digit is in ++ * Common, and isn't '0'-'9' (those were handled earlier). Only ++ * '0'-'9' are acceptable in non-Common scripts. */ ++ retval = FALSE; ++ break; ++ } ++ else { /* Otherwise we now have a zero for this run */ ++ zero_of_run = zero_of_char; ++ } + } /* end of looping through CLOSESR text */ + + Safefree(intersection); +diff --git a/t/re/script_run.t b/t/re/script_run.t +index 10c7103..f8809e3 100644 +--- a/t/re/script_run.t ++++ b/t/re/script_run.t +@@ -97,4 +97,9 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') { + like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run"); + unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things"); + ++ like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/, ++ "Script with own zero works with ASCII digits"); # perl #133547 ++ like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/, ++ "Script without own zero works with ASCII digits"); ++ + done_testing(); +-- +2.17.2 + diff --git a/perl-5.28.1-regexec.c-Rename-variable.patch b/perl-5.28.1-regexec.c-Rename-variable.patch new file mode 100644 index 0000000..a7f5685 --- /dev/null +++ b/perl-5.28.1-regexec.c-Rename-variable.patch @@ -0,0 +1,54 @@ +From 152f5a590ad349922cc90e3e867a599eced7fada Mon Sep 17 00:00:00 2001 +From: Karl Williamson +Date: Sun, 30 Sep 2018 10:33:22 -0600 +Subject: [PATCH] regexec.c: Rename variable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The new name is clearer as to its meaning, more so after the next +commit. + +Petr Písař: Ported to 5.28.1 from +81ec018c6daca2b4c8c87eb335a371b4c90753f3. + +Signed-off-by: Petr Písař +--- + regexec.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/regexec.c b/regexec.c +index d1a3937..899d979 100644 +--- a/regexec.c ++++ b/regexec.c +@@ -10633,20 +10633,20 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) + || zero_of_run == '0' + || script_zeros[script_of_char] == 0)) + { +- SSize_t range_zero_index; +- range_zero_index = _invlist_search(decimals_invlist, cp); +- if ( LIKELY(range_zero_index >= 0) +- && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index)) ++ SSize_t zero_of_char_index; ++ zero_of_char_index = _invlist_search(decimals_invlist, cp); ++ if ( LIKELY(zero_of_char_index >= 0) ++ && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index)) + { +- UV range_zero = decimals_array[range_zero_index]; ++ UV zero_of_char = decimals_array[zero_of_char_index]; + if (zero_of_run) { +- if (zero_of_run != range_zero) { ++ if (zero_of_run != zero_of_char) { + retval = FALSE; + break; + } + } + else { +- zero_of_run = range_zero; ++ zero_of_run = zero_of_char; + } + } + } +-- +2.17.2 + diff --git a/perl.spec b/perl.spec index e5fdd11..78d61d0 100644 --- a/perl.spec +++ b/perl.spec @@ -81,7 +81,7 @@ License: GPL+ or Artistic Epoch: %{perl_epoch} Version: %{perl_version} # release number must be even higher, because dual-lived modules will be broken otherwise -Release: 426%{?dist} +Release: 427%{?dist} Summary: Practical Extraction and Report Language Url: https://www.perl.org/ Source0: https://www.cpan.org/src/5.0/perl-%{perl_version}.tar.xz @@ -193,6 +193,11 @@ Patch29: perl-5.29.2-perl-132655-nul-terminate-result-of-unpack-u-of-inva # Pass the correct CFLAGS to dtrace Patch30: perl-5.28.0-Pass-CFLAGS-to-dtrace.patch +# Fix script run matching to allow ASCII digits in scripts that use their own in +# addition, RT#133547, in upstream after 5.29.3 +Patch31: perl-5.28.1-regexec.c-Rename-variable.patch +Patch32: perl-5.28.1-PATCH-perl-133547-script-run-broken.patch + # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048 Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch @@ -2774,6 +2779,8 @@ Perl extension for Version Objects %patch28 -p1 %patch29 -p1 %patch30 -p1 +%patch31 -p1 +%patch32 -p1 %patch200 -p1 %patch201 -p1 @@ -2806,6 +2813,7 @@ perl -x patchlevel.h \ 'Fedora Patch27: Fix an assignment to a lexical variable in multiconcatenation expressions (RT#133441)' \ 'Fedora Patch28: Fix a spurious warning about uninitialized value in warn (RT#132683)' \ 'Fedora Patch30: Pass the correct CFLAGS to dtrace' \ + 'Fedora Patch31: Fix script run matching to allow ASCII digits in scripts that use their own in addition (RT#133547)' \ 'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \ 'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \ %{nil} @@ -5094,6 +5102,10 @@ popd # Old changelog entries are preserved in CVS. %changelog +* Fri Nov 30 2018 Petr Pisar - 4:5.28.1-427 +- Fix script run matching to allow ASCII digits in scripts that use their own in + addition (RT#133547) + * Fri Nov 30 2018 Jitka Plesnikova - 4:5.28.1-426 - 5.28.1 bump - Fix CVE-2018-18312 (heap-buffer-overflow write in regcomp.c)