From a23ff061c53e7b330bf744e80a575d007d70d75e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
Date: Fri, 30 Nov 2018 13:23:57 +0100
Subject: [PATCH] Fix script run matching to allow ASCII digits in scripts that
 use their own in addition

---
 ...-PATCH-perl-133547-script-run-broken.patch | 184 ++++++++++++++++++
 perl-5.28.1-regexec.c-Rename-variable.patch   |  54 +++++
 perl.spec                                     |  14 +-
 3 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
 create mode 100644 perl-5.28.1-regexec.c-Rename-variable.patch

diff --git a/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch b/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
new file mode 100644
index 0000000..e81e2af
--- /dev/null
+++ b/perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
@@ -0,0 +1,184 @@
+From a824afe95b6272148dce1f8bf4bcd20a667412e6 Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Sun, 30 Sep 2018 10:38:02 -0600
+Subject: [PATCH] PATCH: [perl #133547]: script run broken
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+All scripts can have the ASCII digits for their numbers.  Scripts with
+their own digits can alternatively use those.  Only one of these two
+sets can be used in a script run.  The decision as to which set to use
+must be deferred until the first digit is encountered, as otherwise we
+don't know which set will be used.  Prior to this commit, the decision
+was being made prematurely in some cases.  As a result of this change,
+the non-ASCII-digits in the Common script need to be special-cased, and
+different criteria are used to decide if we need to look up whether a
+character is a digit or not.
+
+Petr Písař: Ported to 5.28.1 from
+393e5a4585b92e635cfc4eee34da8f86f3bfd2af.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ regexec.c         | 111 +++++++++++++++++++++++-----------------------
+ t/re/script_run.t |   5 +++
+ 2 files changed, 61 insertions(+), 55 deletions(-)
+
+diff --git a/regexec.c b/regexec.c
+index 899d979..201d9aa 100644
+--- a/regexec.c
++++ b/regexec.c
+@@ -10323,6 +10323,10 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+ 
+     /* Look at each character in the sequence */
+     while (s < send) {
++        /* If the current character being examined is a digit, this is the code
++         * point of the zero for its sequence of 10 */
++        UV zero_of_char;
++
+         UV cp;
+ 
+         /* The code allows all scripts to use the ASCII digits.  This is
+@@ -10434,16 +10438,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+             script_of_run = script_of_char;
+         }
+ 
+-        /* All decimal digits must be from the same sequence of 10.  Above, we
+-         * handled any ASCII digits without descending to here.  We also
+-         * handled the case where we already knew what digit sequence is the
+-         * one to use, and the character is in that sequence.  Now that we know
+-         * the script, we can use script_zeros[] to directly find which
+-         * sequence the script uses, except in a few cases it returns 0 */
+-        if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
+-            zero_of_run = script_zeros[script_of_char];
+-        }
+-
+         /* Now we can see if the script of the character is the same as that of
+          * the run */
+         if (LIKELY(script_of_char == script_of_run)) {
+@@ -10601,55 +10595,62 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+         /* Here, the script of the character is compatible with that of the
+          * run.  That means that in most cases, it continues the script run.
+          * Either it and the run match exactly, or one or both can be in any of
+-         * several scripts, and the intersection is not empty.  But if the
+-         * character is a decimal digit, we need further handling.  If we
+-         * haven't seen a digit before, it would establish what set of 10 all
+-         * must come from; and if we have established a set, we need to check
+-         * that this is in it.
+-         *
+-         * But there are cases we can rule out without having to look up if
+-         * this is a digit:
+-         *   a.  All instances of [0-9] have been dealt with earlier.
+-         *   b.  The next digit encoded by Unicode is 1600 code points further
+-         *       on, so if the code point in this loop iteration is less than
+-         *       that, it isn't a digit.
+-         *   c.  Most scripts that have digits have a single set of 10.  If
+-         *       we've encountered a digit in such a script, 'zero_of_run' is
+-         *       set to the code point (call it z) whose numeric value is 0.
+-         *       If the code point in this loop iteration is in the range
+-         *       z..z+9, it is in the script's set of 10, and we've actually
+-         *       handled it earlier in this function and won't reach this
+-         *       point.  But, code points in that script that aren't in that
+-         *       range can't be digits, so we don't have to look any such up.
+-         *       We can tell if this script is such a one by looking at
+-         *       'script_zeros[]' for it.  It is non-zero iff it has a single
+-         *       set of digits.  This rule doesn't apply if we haven't narrowed
+-         *       down the possible scripts to a single one yet.  Nor if the
+-         *       zero of the run is '0', as that also hasn't narrowed things
+-         *       down completely */
+-        if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
+-            && (   intersection
+-                || script_of_char < 0   /* Also implies an intersection */
+-                || zero_of_run == '0'
+-                || script_zeros[script_of_char] == 0))
++         * several scripts, and the intersection is not empty.  However, if the
++         * character is a decimal digit, it could still mean failure if it is
++         * from the wrong sequence of 10.  So, we need to check whether it's a
++         * digit.  We've already handled the 10 ASCII digits, and the next
++         * lowest decimal digit is this one: */
++        if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
++            continue;   /* Not a digit; this character is part of the run */
++        }
++
++        /* If we have a definitive '0' for the script of this character, we
++         * know that for this to be a digit, it must be in the range of +0..+9
++         * of that zero. */
++        if (   script_of_char >= 0
++            && (zero_of_char = script_zeros[script_of_char]))
+         {
+-            SSize_t zero_of_char_index;
+-            zero_of_char_index = _invlist_search(decimals_invlist, cp);
+-            if (   LIKELY(zero_of_char_index >= 0)
+-                && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
++            if (   cp < zero_of_char
++                || cp > zero_of_char + 9)
+             {
+-                UV zero_of_char = decimals_array[zero_of_char_index];
+-                if (zero_of_run) {
+-                    if (zero_of_run != zero_of_char) {
+-                        retval = FALSE;
+-                        break;
+-                    }
+-                }
+-                else {
+-                    zero_of_run = zero_of_char;
+-                }
++                continue;   /* Not a digit; this character is part of the run
++                             */
++            }
++
++        }
++        else {  /* Need to look up if this character is a digit or not */
++            SSize_t index_of_zero_of_char;
++            index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
++            if (     UNLIKELY(index_of_zero_of_char < 0)
++                || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
++            {
++                continue;   /* Not a digit; this character is part of the run.
++                             */
++            }
++
++            zero_of_char = decimals_array[index_of_zero_of_char];
++        }
++
++        /* Here, the character is a decimal digit, and the zero of its sequence
++         * of 10 is in 'zero_of_char'.  If we already have a zero for this run,
++         * the two must be the same. */
++        if (zero_of_run) {
++            if (zero_of_run != zero_of_char) {
++                retval = FALSE;
++                break;
+             }
+         }
++        else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
++
++            /* Here, the script run isn't Common, but the current digit is in
++             * Common, and isn't '0'-'9' (those were handled earlier).   Only
++             * '0'-'9' are acceptable in non-Common scripts. */
++            retval = FALSE;
++            break;
++        }
++        else {  /* Otherwise we now have a zero for this run */
++            zero_of_run = zero_of_char;
++        }
+     } /* end of looping through CLOSESR text */
+ 
+     Safefree(intersection);
+diff --git a/t/re/script_run.t b/t/re/script_run.t
+index 10c7103..f8809e3 100644
+--- a/t/re/script_run.t
++++ b/t/re/script_run.t
+@@ -97,4 +97,9 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
+       like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
+     unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
+ 
++    like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/,
++         "Script with own zero works with ASCII digits"); # perl #133547
++    like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/,
++         "Script without own zero works with ASCII digits");
++
+ done_testing();
+-- 
+2.17.2
+
diff --git a/perl-5.28.1-regexec.c-Rename-variable.patch b/perl-5.28.1-regexec.c-Rename-variable.patch
new file mode 100644
index 0000000..a7f5685
--- /dev/null
+++ b/perl-5.28.1-regexec.c-Rename-variable.patch
@@ -0,0 +1,54 @@
+From 152f5a590ad349922cc90e3e867a599eced7fada Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Sun, 30 Sep 2018 10:33:22 -0600
+Subject: [PATCH] regexec.c: Rename variable
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The new name is clearer as to its meaning, more so after the next
+commit.
+
+Petr Písař: Ported to 5.28.1 from
+81ec018c6daca2b4c8c87eb335a371b4c90753f3.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ regexec.c | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/regexec.c b/regexec.c
+index d1a3937..899d979 100644
+--- a/regexec.c
++++ b/regexec.c
+@@ -10633,20 +10633,20 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+                 || zero_of_run == '0'
+                 || script_zeros[script_of_char] == 0))
+         {
+-            SSize_t range_zero_index;
+-            range_zero_index = _invlist_search(decimals_invlist, cp);
+-            if (   LIKELY(range_zero_index >= 0)
+-                && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
++            SSize_t zero_of_char_index;
++            zero_of_char_index = _invlist_search(decimals_invlist, cp);
++            if (   LIKELY(zero_of_char_index >= 0)
++                && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
+             {
+-                UV range_zero = decimals_array[range_zero_index];
++                UV zero_of_char = decimals_array[zero_of_char_index];
+                 if (zero_of_run) {
+-                    if (zero_of_run != range_zero) {
++                    if (zero_of_run != zero_of_char) {
+                         retval = FALSE;
+                         break;
+                     }
+                 }
+                 else {
+-                    zero_of_run = range_zero;
++                    zero_of_run = zero_of_char;
+                 }
+             }
+         }
+-- 
+2.17.2
+
diff --git a/perl.spec b/perl.spec
index e5fdd11..78d61d0 100644
--- a/perl.spec
+++ b/perl.spec
@@ -81,7 +81,7 @@ License:        GPL+ or Artistic
 Epoch:          %{perl_epoch}
 Version:        %{perl_version}
 # release number must be even higher, because dual-lived modules will be broken otherwise
-Release:        426%{?dist}
+Release:        427%{?dist}
 Summary:        Practical Extraction and Report Language
 Url:            https://www.perl.org/
 Source0:        https://www.cpan.org/src/5.0/perl-%{perl_version}.tar.xz
@@ -193,6 +193,11 @@ Patch29:        perl-5.29.2-perl-132655-nul-terminate-result-of-unpack-u-of-inva
 # Pass the correct CFLAGS to dtrace
 Patch30:        perl-5.28.0-Pass-CFLAGS-to-dtrace.patch
 
+# Fix script run matching to allow ASCII digits in scripts that use their own in
+# addition, RT#133547, in upstream after 5.29.3
+Patch31:        perl-5.28.1-regexec.c-Rename-variable.patch
+Patch32:        perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
+
 # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
 Patch200:       perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch
 
@@ -2774,6 +2779,8 @@ Perl extension for Version Objects
 %patch28 -p1
 %patch29 -p1
 %patch30 -p1
+%patch31 -p1
+%patch32 -p1
 %patch200 -p1
 %patch201 -p1
 
@@ -2806,6 +2813,7 @@ perl -x patchlevel.h \
     'Fedora Patch27: Fix an assignment to a lexical variable in multiconcatenation expressions (RT#133441)' \
     'Fedora Patch28: Fix a spurious warning about uninitialized value in warn (RT#132683)' \
     'Fedora Patch30: Pass the correct CFLAGS to dtrace' \
+    'Fedora Patch31: Fix script run matching to allow ASCII digits in scripts that use their own in addition (RT#133547)' \
     'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
     'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
     %{nil}
@@ -5094,6 +5102,10 @@ popd
 
 # Old changelog entries are preserved in CVS.
 %changelog
+* Fri Nov 30 2018 Petr Pisar <ppisar@redhat.com> - 4:5.28.1-427
+- Fix script run matching to allow ASCII digits in scripts that use their own in
+  addition (RT#133547)
+
 * Fri Nov 30 2018 Jitka Plesnikova <jplesnik@redhat.com> - 4:5.28.1-426
 - 5.28.1 bump
 - Fix CVE-2018-18312 (heap-buffer-overflow write in regcomp.c)