Fix script run matching to allow ASCII digits in scripts that use their own in addition

2018-11-30 13:23:57 +01:00 · 2018-11-30 13:23:57 +01:00 · a23ff061c5
commit a23ff061c5
parent 16261479ab
3 changed files with 251 additions and 1 deletion
--- a/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
+++ b/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
@ -0,0 +1,184 @@
+From a824afe95b6272148dce1f8bf4bcd20a667412e6 Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Sun, 30 Sep 2018 10:38:02 -0600
+Subject: [PATCH] PATCH: [perl #133547]: script run broken
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+All scripts can have the ASCII digits for their numbers.  Scripts with
+their own digits can alternatively use those.  Only one of these two
+sets can be used in a script run.  The decision as to which set to use
+must be deferred until the first digit is encountered, as otherwise we
+don't know which set will be used.  Prior to this commit, the decision
+was being made prematurely in some cases.  As a result of this change,
+the non-ASCII-digits in the Common script need to be special-cased, and
+different criteria are used to decide if we need to look up whether a
+character is a digit or not.
+
+Petr Písař: Ported to 5.28.1 from
+393e5a4585b92e635cfc4eee34da8f86f3bfd2af.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ regexec.c         | 111 +++++++++++++++++++++++-----------------------
+ t/re/script_run.t |   5 +++
+ 2 files changed, 61 insertions(+), 55 deletions(-)
+
+diff --git a/regexec.c b/regexec.c
+index 899d979..201d9aa 100644
+--- a/regexec.c
+++ b/regexec.c
+@@ -10323,6 +10323,10 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+ 
+     /* Look at each character in the sequence */
+     while (s < send) {
+        /* If the current character being examined is a digit, this is the code
+         * point of the zero for its sequence of 10 */
+        UV zero_of_char;
+
+         UV cp;
+ 
+         /* The code allows all scripts to use the ASCII digits.  This is
+@@ -10434,16 +10438,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+             script_of_run = script_of_char;
+         }
+ 
+-        /* All decimal digits must be from the same sequence of 10.  Above, we
+-         * handled any ASCII digits without descending to here.  We also
+-         * handled the case where we already knew what digit sequence is the
+-         * one to use, and the character is in that sequence.  Now that we know
+-         * the script, we can use script_zeros[] to directly find which
+-         * sequence the script uses, except in a few cases it returns 0 */
+-        if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+-            zero_of_run = script_zeros[script_of_char];
+-        }
+-
+         /* Now we can see if the script of the character is the same as that of
+          * the run */
+         if (LIKELY(script_of_char == script_of_run)) {
+@@ -10601,55 +10595,62 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+         /* Here, the script of the character is compatible with that of the
+          * run.  That means that in most cases, it continues the script run.
+          * Either it and the run match exactly, or one or both can be in any of
+-         * several scripts, and the intersection is not empty.  But if the
+-         * character is a decimal digit, we need further handling.  If we
+-         * haven't seen a digit before, it would establish what set of 10 all
+-         * must come from; and if we have established a set, we need to check
+-         * that this is in it.
+-         *
+-         * But there are cases we can rule out without having to look up if
+-         * this is a digit:
+-         *   a.  All instances of [0-9] have been dealt with earlier.
+-         *   b.  The next digit encoded by Unicode is 1600 code points further
+-         *       on, so if the code point in this loop iteration is less than
+-         *       that, it isn't a digit.
+-         *   c.  Most scripts that have digits have a single set of 10.  If
+-         *       we've encountered a digit in such a script, 'zero_of_run' is
+-         *       set to the code point (call it z) whose numeric value is 0.
+-         *       If the code point in this loop iteration is in the range
+-         *       z..z+9, it is in the script's set of 10, and we've actually
+-         *       handled it earlier in this function and won't reach this
+-         *       point.  But, code points in that script that aren't in that
+-         *       range can't be digits, so we don't have to look any such up.
+-         *       We can tell if this script is such a one by looking at
+-         *       'script_zeros[]' for it.  It is non-zero iff it has a single
+-         *       set of digits.  This rule doesn't apply if we haven't narrowed
+-         *       down the possible scripts to a single one yet.  Nor if the
+-         *       zero of the run is '0', as that also hasn't narrowed things
+-         *       down completely */
+-        if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
+-            && (   intersection
+-                || script_of_char < 0   /* Also implies an intersection */
+-                || zero_of_run == '0'
+-                || script_zeros[script_of_char] == 0))
+         * several scripts, and the intersection is not empty.  However, if the
+         * character is a decimal digit, it could still mean failure if it is
+         * from the wrong sequence of 10.  So, we need to check whether it's a
+         * digit.  We've already handled the 10 ASCII decimal digits, and the
+         * next lowest decimal digit code point is this one: */
+        if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
+            continue;   /* Not a digit; this character is part of the run */
+        }
+
+        /* If we have a definitive '0' for the script of this character, we
+         * know that for this to be a digit, it must be in the range of +0..+9
+         * of that zero. */
+        if (   script_of_char >= 0
+            && (zero_of_char = script_zeros[script_of_char]))
+         {
+-            SSize_t zero_of_char_index;
+-            zero_of_char_index = _invlist_search(decimals_invlist, cp);
+-            if (   LIKELY(zero_of_char_index >= 0)
+-                && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
+            if (   cp < zero_of_char
+                || cp > zero_of_char + 9)
+             {
+-                UV zero_of_char = decimals_array[zero_of_char_index];
+-                if (zero_of_run) {
+-                    if (zero_of_run != zero_of_char) {
+-                        retval = FALSE;
+-                        break;
+-                    }
+-                }
+-                else {
+-                    zero_of_run = zero_of_char;
+-                }
+                continue;   /* Not a digit; this character is part of the run
+                             */
+            }
+
+        }
+        else {  /* Need to look up if this character is a digit or not */
+            SSize_t index_of_zero_of_char;
+            index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
+            if (     UNLIKELY(index_of_zero_of_char < 0)
+                || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
+            {
+                continue;   /* Not a digit; this character is part of the run.
+                             */
+            }
+
+            zero_of_char = decimals_array[index_of_zero_of_char];
+        }
+
+        /* Here, the character is a decimal digit, and the zero of its sequence
+         * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
+         * they must be the same. */
+        if (zero_of_run) {
+            if (zero_of_run != zero_of_char) {
+                retval = FALSE;
+                break;
+             }
+         }
+        else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
+
+            /* Here, the script run isn't Common, but the current digit is in
+             * Common, and isn't '0'-'9' (those were handled earlier).  Of the
+             * Common digits, only '0'-'9' are allowed in non-Common runs. */
+            retval = FALSE;
+            break;
+        }
+        else {  /* Otherwise we now have a zero for this run */
+            zero_of_run = zero_of_char;
+        }
+     } /* end of looping through CLOSESR text */
+ 
+     Safefree(intersection);
+diff --git a/t/re/script_run.t b/t/re/script_run.t
+index 10c7103..f8809e3 100644
+--- a/t/re/script_run.t
+++ b/t/re/script_run.t
+@@ -97,4 +97,9 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
+       like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
+     unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
+ 
+    like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/,
+         "Script with own zero works with ASCII digits"); # perl #133547
+    like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/,
+         "Script without own zero works with ASCII digits");
+
+ done_testing();
+-- 
+2.17.2
+
--- a/perl-5.28.1-regexec.c-Rename-variable.patch
+++ b/perl-5.28.1-regexec.c-Rename-variable.patch
@ -0,0 +1,54 @@
+From 152f5a590ad349922cc90e3e867a599eced7fada Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Sun, 30 Sep 2018 10:33:22 -0600
+Subject: [PATCH] regexec.c: Rename variable
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The new name is clearer as to its meaning, more so after the next
+commit.
+
+Petr Písař: Ported to 5.28.1 from
+81ec018c6daca2b4c8c87eb335a371b4c90753f3.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ regexec.c | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/regexec.c b/regexec.c
+index d1a3937..899d979 100644
+--- a/regexec.c
+++ b/regexec.c
+@@ -10633,20 +10633,20 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+                 || zero_of_run == '0'
+                 || script_zeros[script_of_char] == 0))
+         {
+-            SSize_t range_zero_index;
+-            range_zero_index = _invlist_search(decimals_invlist, cp);
+-            if (   LIKELY(range_zero_index >= 0)
+-                && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
+            SSize_t zero_of_char_index;
+            zero_of_char_index = _invlist_search(decimals_invlist, cp);
+            if (   LIKELY(zero_of_char_index >= 0)
+                && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
+             {
+-                UV range_zero = decimals_array[range_zero_index];
+                UV zero_of_char = decimals_array[zero_of_char_index];
+                 if (zero_of_run) {
+-                    if (zero_of_run != range_zero) {
+                    if (zero_of_run != zero_of_char) {
+                         retval = FALSE;
+                         break;
+                     }
+                 }
+                 else {
+-                    zero_of_run = range_zero;
+                    zero_of_run = zero_of_char;
+                 }
+             }
+         }
+-- 
+2.17.2
+
--- a/perl.spec
+++ b/perl.spec
@ -81,7 +81,7 @@ License:        GPL+ or Artistic
 Epoch:          %{perl_epoch}
 Version:        %{perl_version}
 # release number must be even higher, because dual-lived modules will be broken otherwise
-Release:        426%{?dist}
+Release:        427%{?dist}
 Summary:        Practical Extraction and Report Language
 Url:            https://www.perl.org/
 Source0:        https://www.cpan.org/src/5.0/perl-%{perl_version}.tar.xz
@ -193,6 +193,11 @@ Patch29:        perl-5.29.2-perl-132655-nul-terminate-result-of-unpack-u-of-inva
 # Pass the correct CFLAGS to dtrace
 Patch30:        perl-5.28.0-Pass-CFLAGS-to-dtrace.patch

+# Fix script run matching to allow ASCII digits in scripts that use their own in
+# addition, RT#133547, in upstream after 5.29.3
+Patch31:        perl-5.28.1-regexec.c-Rename-variable.patch
+Patch32:        perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
+
 # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
 Patch200:       perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch

@ -2774,6 +2779,8 @@ Perl extension for Version Objects
 %patch28 -p1
 %patch29 -p1
 %patch30 -p1
+%patch31 -p1
+%patch32 -p1
 %patch200 -p1
 %patch201 -p1

@ -2806,6 +2813,7 @@ perl -x patchlevel.h \
    'Fedora Patch27: Fix an assignment to a lexical variable in multiconcatenation expressions (RT#133441)' \
    'Fedora Patch28: Fix a spurious warning about uninitialized value in warn (RT#132683)' \
    'Fedora Patch30: Pass the correct CFLAGS to dtrace' \
+    'Fedora Patch31: Fix script run matching to allow ASCII digits in scripts that use their own in addition (RT#133547)' \
    'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
    'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
    %{nil}
@ -5094,6 +5102,10 @@ popd

 # Old changelog entries are preserved in CVS.
 %changelog
+* Fri Nov 30 2018 Petr Pisar <ppisar@redhat.com> - 4:5.28.1-427
+- Fix script run matching to allow ASCII digits in scripts that use their own in
+  addition (RT#133547)
+
 * Fri Nov 30 2018 Jitka Plesnikova <jplesnik@redhat.com> - 4:5.28.1-426
 - 5.28.1 bump
 - Fix CVE-2018-18312 (heap-buffer-overflow write in regcomp.c)