Fix matching an ASCII digit followed by a non-ASCII digit using a script run

2018-09-05 15:54:07 +02:00 · 2018-09-05 15:54:07 +02:00 · e82b9306ac
commit e82b9306ac
parent e039a7964c
2 changed files with 107 additions and 0 deletions
--- a/perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch
+++ b/perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch
@@ -0,0 +1,100 @@
+From 7b4a3fe1d488df004e3969802fe121697cd3d6e5 Mon Sep 17 00:00:00 2001
+From: Karl Williamson <khw@cpan.org>
+Date: Thu, 16 Aug 2018 16:14:01 -0600
+Subject: [PATCH] Fix script run bug '1' followed by Thai digit
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This does not have a ticket, but was pointed out in
+http://nntp.perl.org/group/perl.perl5.porters/251870
+
+The logic for deciding if it was needed to check if a character is a
+digit was flawed.
+
+Petr Písař: Ported to 5.28.0.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ regexec.c         | 46 +++++++++++++++++++++++++++++++---------------
+ t/re/script_run.t |  5 +++++
+ 2 files changed, 36 insertions(+), 15 deletions(-)
+
+diff --git a/regexec.c b/regexec.c
+index 95bb254..d1a3937 100644
+--- a/regexec.c
+++ b/regexec.c
+@@ -10599,23 +10599,39 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
+   scripts_match:
+ 
+         /* Here, the script of the character is compatible with that of the
+-         * run.  Either they match exactly, or one or both can be any of
+-         * several scripts, and the intersection is not empty.  If the
+-         * character is not a decimal digit, we are done with it.  Otherwise,
+-         * it could still fail if it is from a different set of 10 than seen
+-         * already (or we may not have seen any, and we need to set the
+-         * sequence).  If we have determined a single script and that script
+-         * only has one set of digits (almost all scripts are like that), then
+-         * this isn't a problem, as any digit must come from the same sequence.
+-         * The only scripts that have multiple sequences have been constructed
+-         * to be 0 in 'script_zeros[]'.
++         * run.  That means that in most cases, it continues the script run.
++         * Either it and the run match exactly, or one or both can be in any of
++         * several scripts, and the intersection is not empty.  But if the
++         * character is a decimal digit, we need further handling.  If we
++         * haven't seen a digit before, it would establish what set of 10 all
++         * must come from; and if we have established a set, we need to check
++         * that this is in it.
+          *
+-         * Here we check if it is a digit. */
++         * But there are cases we can rule out without having to look up if
++         * this is a digit:
++         *   a.  All instances of [0-9] have been dealt with earlier.
++         *   b.  The next digit encoded by Unicode is 1600 code points further
++         *       on, so if the code point in this loop iteration is less than
++         *       that, it isn't a digit.
++         *   c.  Most scripts that have digits have a single set of 10.  If
++         *       we've encountered a digit in such a script, 'zero_of_run' is
++         *       set to the code point (call it z) whose numeric value is 0.
++         *       If the code point in this loop iteration is in the range
++         *       z..z+9, it is in the script's set of 10, and we've actually
++         *       handled it earlier in this function and won't reach this
++         *       point.  But, code points in that script that aren't in that
++         *       range can't be digits, so we don't have to look any such up.
++         *       We can tell if this script is such a one by looking at
++         *       'script_zeros[]' for it.  It is non-zero iff it has a single
++         *       set of digits.  This rule doesn't apply if we haven't narrowed
++         *       down the possible scripts to a single one yet.  Nor if the
++         *       zero of the run is '0', as that also hasn't narrowed things
++         *       down completely */
+         if (    cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
+-            && (   (          zero_of_run == 0
+-                    || (  (   script_of_char >= 0
+-                           && script_zeros[script_of_char] == 0)
+-                        ||    intersection))))
++            && (   intersection
++                || script_of_char < 0   /* Also implies an intersection */
++                || zero_of_run == '0'
++                || script_zeros[script_of_char] == 0))
+         {
+             SSize_t range_zero_index;
+             range_zero_index = _invlist_search(decimals_invlist, cp);
+diff --git a/t/re/script_run.t b/t/re/script_run.t
+index ca234d9..10c7103 100644
+--- a/t/re/script_run.t
+++ b/t/re/script_run.t
+@@ -84,6 +84,11 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
+ 
+     # From UTS 39
+     like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han");
++
++    unlike "\N{THAI DIGIT FIVE}1", $script_run, "Thai digit followed by '1'";
++    unlike "1\N{THAI DIGIT FIVE}", $script_run, "'1' followed by Thai digit ";
++    unlike "\N{BENGALI DIGIT ZERO}\N{CHAKMA DIGIT SEVEN}", $script_run,
++           "Two digits in same extended script but from different sets of 10";
+ }
+ 
+     # Until fixed, this was skipping the '['
+-- 
+2.14.4
+
--- a/perl.spec
+++ b/perl.spec
@@ -190,6 +190,10 @@ Patch26:        perl-5.29.1-Make-utf8_to_uvchr-slightly-safer.patch
 # Fix a time race in Time-HiRes/t/itimer.t test, in upstream after 5.29.1
 Patch27:        perl-5.29.1-Time-HiRes-t-itimer.t-avoid-race-condition.patch

+# Fix matching an ASCII digit followed by a non-ASCII digit using a script
+# run, in upstream after 5.29.1
+Patch28:        perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch
+
 # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
 Patch200:       perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch

@@ -2769,6 +2773,7 @@ Perl extension for Version Objects
 %patch25 -p1
 %patch26 -p1
 %patch27 -p1
+%patch28 -p1
 %patch200 -p1
 %patch201 -p1

@@ -2803,6 +2808,7 @@ perl -x patchlevel.h \
    'Fedora Patch25: Fix a buffer overrun in deprecated utf8_to_uvchr()' \
    'Fedora Patch26: Fix a buffer overrun in deprecated utf8_to_uvchr()' \
    'Fedora Patch27: Fix a time race in Time-HiRes/t/itimer.t test' \
+    'Fedora Patch28: Fix matching an ASCII digit followed by a non-ASCII digit using a script run' \
    'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
    'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
    %{nil}
@@ -5095,6 +5101,7 @@ popd
 - Fix a buffer overrun in deprecated S_is_utf8_common()
 - Fix a buffer overrun in deprecated utf8_to_uvchr()
 - Fix a time race in Time-HiRes/t/itimer.t test
+- Fix matching an ASCII digit followed by a non-ASCII digit using a script run

 * Wed Aug 01 2018 Petr Pisar <ppisar@redhat.com> - 4:5.28.0-420
 - Fix a file descriptor leak in in-place edits (RT#133314)