Fix script run matching to allow ASCII digits in scripts that use their own in addition
This commit is contained in:
parent
16261479ab
commit
a23ff061c5
184
perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
Normal file
184
perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
Normal file
@ -0,0 +1,184 @@
|
||||
From a824afe95b6272148dce1f8bf4bcd20a667412e6 Mon Sep 17 00:00:00 2001
|
||||
From: Karl Williamson <khw@cpan.org>
|
||||
Date: Sun, 30 Sep 2018 10:38:02 -0600
|
||||
Subject: [PATCH] PATCH: [perl #133547]: script run broken
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
All scripts can have the ASCII digits for their numbers. Scripts with
|
||||
their own digits can alternatively use those. Only one of these two
|
||||
sets can be used in a script run. The decision as to which set to use
|
||||
must be deferred until the first digit is encountered, as otherwise we
|
||||
don't know which set will be used. Prior to this commit, the decision
|
||||
was being made prematurely in some cases. As a result of this change,
|
||||
the non-ASCII-digits in the Common script need to be special-cased, and
|
||||
different criteria are used to decide if we need to look up whether a
|
||||
character is a digit or not.
|
||||
|
||||
Petr Písař: Ported to 5.28.1 from
|
||||
393e5a4585b92e635cfc4eee34da8f86f3bfd2af.
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
regexec.c | 111 +++++++++++++++++++++++-----------------------
|
||||
t/re/script_run.t | 5 +++
|
||||
2 files changed, 61 insertions(+), 55 deletions(-)
|
||||
|
||||
diff --git a/regexec.c b/regexec.c
|
||||
index 899d979..201d9aa 100644
|
||||
--- a/regexec.c
|
||||
+++ b/regexec.c
|
||||
@@ -10323,6 +10323,10 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
||||
|
||||
/* Look at each character in the sequence */
|
||||
while (s < send) {
|
||||
+ /* If the current character being examined is a digit, this is the code
|
||||
+ * point of the zero for its sequence of 10 */
|
||||
+ UV zero_of_char;
|
||||
+
|
||||
UV cp;
|
||||
|
||||
/* The code allows all scripts to use the ASCII digits. This is
|
||||
@@ -10434,16 +10438,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
||||
script_of_run = script_of_char;
|
||||
}
|
||||
|
||||
- /* All decimal digits must be from the same sequence of 10. Above, we
|
||||
- * handled any ASCII digits without descending to here. We also
|
||||
- * handled the case where we already knew what digit sequence is the
|
||||
- * one to use, and the character is in that sequence. Now that we know
|
||||
- * the script, we can use script_zeros[] to directly find which
|
||||
- * sequence the script uses, except in a few cases it returns 0 */
|
||||
- if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
|
||||
- zero_of_run = script_zeros[script_of_char];
|
||||
- }
|
||||
-
|
||||
/* Now we can see if the script of the character is the same as that of
|
||||
* the run */
|
||||
if (LIKELY(script_of_char == script_of_run)) {
|
||||
@@ -10601,55 +10595,62 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
||||
/* Here, the script of the character is compatible with that of the
|
||||
* run. That means that in most cases, it continues the script run.
|
||||
* Either it and the run match exactly, or one or both can be in any of
|
||||
- * several scripts, and the intersection is not empty. But if the
|
||||
- * character is a decimal digit, we need further handling. If we
|
||||
- * haven't seen a digit before, it would establish what set of 10 all
|
||||
- * must come from; and if we have established a set, we need to check
|
||||
- * that this is in it.
|
||||
- *
|
||||
- * But there are cases we can rule out without having to look up if
|
||||
- * this is a digit:
|
||||
- * a. All instances of [0-9] have been dealt with earlier.
|
||||
- * b. The next digit encoded by Unicode is 1600 code points further
|
||||
- * on, so if the code point in this loop iteration is less than
|
||||
- * that, it isn't a digit.
|
||||
- * c. Most scripts that have digits have a single set of 10. If
|
||||
- * we've encountered a digit in such a script, 'zero_of_run' is
|
||||
- * set to the code point (call it z) whose numeric value is 0.
|
||||
- * If the code point in this loop iteration is in the range
|
||||
- * z..z+9, it is in the script's set of 10, and we've actually
|
||||
- * handled it earlier in this function and won't reach this
|
||||
- * point. But, code points in that script that aren't in that
|
||||
- * range can't be digits, so we don't have to look any such up.
|
||||
- * We can tell if this script is such a one by looking at
|
||||
- * 'script_zeros[]' for it. It is non-zero iff it has a single
|
||||
- * set of digits. This rule doesn't apply if we haven't narrowed
|
||||
- * down the possible scripts to a single one yet. Nor if the
|
||||
- * zero of the run is '0', as that also hasn't narrowed things
|
||||
- * down completely */
|
||||
- if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
|
||||
- && ( intersection
|
||||
- || script_of_char < 0 /* Also implies an intersection */
|
||||
- || zero_of_run == '0'
|
||||
- || script_zeros[script_of_char] == 0))
|
||||
+ * several scripts, and the intersection is not empty. However, if the
|
||||
+ * character is a decimal digit, it could still mean failure if it is
|
||||
+ * from the wrong sequence of 10. So, we need to check whether it's a
|
||||
+ * digit. We've already handled the 10 ASCII decimal digits, and the next
|
||||
+ * lowest one is this one: */
|
||||
+ if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
|
||||
+ continue; /* Not a digit; this character is part of the run */
|
||||
+ }
|
||||
+
|
||||
+ /* If we have a definitive '0' for the script of this character, we
|
||||
+ * know that for this to be a digit, it must be in the range of +0..+9
|
||||
+ * of that zero. */
|
||||
+ if ( script_of_char >= 0
|
||||
+ && (zero_of_char = script_zeros[script_of_char]))
|
||||
{
|
||||
- SSize_t zero_of_char_index;
|
||||
- zero_of_char_index = _invlist_search(decimals_invlist, cp);
|
||||
- if ( LIKELY(zero_of_char_index >= 0)
|
||||
- && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
|
||||
+ if ( cp < zero_of_char
|
||||
+ || cp > zero_of_char + 9)
|
||||
{
|
||||
- UV zero_of_char = decimals_array[zero_of_char_index];
|
||||
- if (zero_of_run) {
|
||||
- if (zero_of_run != zero_of_char) {
|
||||
- retval = FALSE;
|
||||
- break;
|
||||
- }
|
||||
- }
|
||||
- else {
|
||||
- zero_of_run = zero_of_char;
|
||||
- }
|
||||
+ continue; /* Not a digit; this character is part of the run
|
||||
+ */
|
||||
+ }
|
||||
+
|
||||
+ }
|
||||
+ else { /* Need to look up if this character is a digit or not */
|
||||
+ SSize_t index_of_zero_of_char;
|
||||
+ index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
|
||||
+ if ( UNLIKELY(index_of_zero_of_char < 0)
|
||||
+ || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
|
||||
+ {
|
||||
+ continue; /* Not a digit; this character is part of the run.
|
||||
+ */
|
||||
+ }
|
||||
+
|
||||
+ zero_of_char = decimals_array[index_of_zero_of_char];
|
||||
+ }
|
||||
+
|
||||
+ /* Here, the character is a decimal digit, and the zero of its sequence
|
||||
+ * of 10 is in 'zero_of_char'. If we already have a zero for this run,
|
||||
+ * they had better be the same. */
|
||||
+ if (zero_of_run) {
|
||||
+ if (zero_of_run != zero_of_char) {
|
||||
+ retval = FALSE;
|
||||
+ break;
|
||||
}
|
||||
}
|
||||
+ else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
|
||||
+
|
||||
+ /* Here, the script run isn't Common, but the current digit is in
|
||||
+ * Common, and isn't '0'-'9' (those were handled earlier). Only
|
||||
+ * '0'-'9' are acceptable in non-Common scripts. */
|
||||
+ retval = FALSE;
|
||||
+ break;
|
||||
+ }
|
||||
+ else { /* Otherwise we now have a zero for this run */
|
||||
+ zero_of_run = zero_of_char;
|
||||
+ }
|
||||
} /* end of looping through CLOSESR text */
|
||||
|
||||
Safefree(intersection);
|
||||
diff --git a/t/re/script_run.t b/t/re/script_run.t
|
||||
index 10c7103..f8809e3 100644
|
||||
--- a/t/re/script_run.t
|
||||
+++ b/t/re/script_run.t
|
||||
@@ -97,4 +97,9 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
|
||||
like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
|
||||
unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
|
||||
|
||||
+ like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/,
|
||||
+ "Script with own zero works with ASCII digits"); # perl #133547
|
||||
+ like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/,
|
||||
+ "Script without own zero works with ASCII digits");
|
||||
+
|
||||
done_testing();
|
||||
--
|
||||
2.17.2
|
||||
|
54
perl-5.28.1-regexec.c-Rename-variable.patch
Normal file
54
perl-5.28.1-regexec.c-Rename-variable.patch
Normal file
@ -0,0 +1,54 @@
|
||||
From 152f5a590ad349922cc90e3e867a599eced7fada Mon Sep 17 00:00:00 2001
|
||||
From: Karl Williamson <khw@cpan.org>
|
||||
Date: Sun, 30 Sep 2018 10:33:22 -0600
|
||||
Subject: [PATCH] regexec.c: Rename variable
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
The new name is clearer as to its meaning, more so after the next
|
||||
commit.
|
||||
|
||||
Petr Písař: Ported to 5.28.1 from
|
||||
81ec018c6daca2b4c8c87eb335a371b4c90753f3.
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
regexec.c | 14 +++++++-------
|
||||
1 file changed, 7 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/regexec.c b/regexec.c
|
||||
index d1a3937..899d979 100644
|
||||
--- a/regexec.c
|
||||
+++ b/regexec.c
|
||||
@@ -10633,20 +10633,20 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
||||
|| zero_of_run == '0'
|
||||
|| script_zeros[script_of_char] == 0))
|
||||
{
|
||||
- SSize_t range_zero_index;
|
||||
- range_zero_index = _invlist_search(decimals_invlist, cp);
|
||||
- if ( LIKELY(range_zero_index >= 0)
|
||||
- && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index))
|
||||
+ SSize_t zero_of_char_index;
|
||||
+ zero_of_char_index = _invlist_search(decimals_invlist, cp);
|
||||
+ if ( LIKELY(zero_of_char_index >= 0)
|
||||
+ && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
|
||||
{
|
||||
- UV range_zero = decimals_array[range_zero_index];
|
||||
+ UV zero_of_char = decimals_array[zero_of_char_index];
|
||||
if (zero_of_run) {
|
||||
- if (zero_of_run != range_zero) {
|
||||
+ if (zero_of_run != zero_of_char) {
|
||||
retval = FALSE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
- zero_of_run = range_zero;
|
||||
+ zero_of_run = zero_of_char;
|
||||
}
|
||||
}
|
||||
}
|
||||
--
|
||||
2.17.2
|
||||
|
14
perl.spec
14
perl.spec
@ -81,7 +81,7 @@ License: GPL+ or Artistic
|
||||
Epoch: %{perl_epoch}
|
||||
Version: %{perl_version}
|
||||
# release number must be even higher, because dual-lived modules will be broken otherwise
|
||||
Release: 426%{?dist}
|
||||
Release: 427%{?dist}
|
||||
Summary: Practical Extraction and Report Language
|
||||
Url: https://www.perl.org/
|
||||
Source0: https://www.cpan.org/src/5.0/perl-%{perl_version}.tar.xz
|
||||
@ -193,6 +193,11 @@ Patch29: perl-5.29.2-perl-132655-nul-terminate-result-of-unpack-u-of-inva
|
||||
# Pass the correct CFLAGS to dtrace
|
||||
Patch30: perl-5.28.0-Pass-CFLAGS-to-dtrace.patch
|
||||
|
||||
# Fix script run matching to allow ASCII digits in scripts that use their own in
|
||||
# addition, RT#133547, in upstream after 5.29.3
|
||||
Patch31: perl-5.28.1-regexec.c-Rename-variable.patch
|
||||
Patch32: perl-5.28.1-PATCH-perl-133547-script-run-broken.patch
|
||||
|
||||
# Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
|
||||
Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch
|
||||
|
||||
@ -2774,6 +2779,8 @@ Perl extension for Version Objects
|
||||
%patch28 -p1
|
||||
%patch29 -p1
|
||||
%patch30 -p1
|
||||
%patch31 -p1
|
||||
%patch32 -p1
|
||||
%patch200 -p1
|
||||
%patch201 -p1
|
||||
|
||||
@ -2806,6 +2813,7 @@ perl -x patchlevel.h \
|
||||
'Fedora Patch27: Fix an assignment to a lexical variable in multiconcatenation expressions (RT#133441)' \
|
||||
'Fedora Patch28: Fix a spurious warning about uninitialized value in warn (RT#132683)' \
|
||||
'Fedora Patch30: Pass the correct CFLAGS to dtrace' \
|
||||
'Fedora Patch31: Fix script run matching to allow ASCII digits in scripts that use their own in addition (RT#133547)' \
|
||||
'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
|
||||
'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
|
||||
%{nil}
|
||||
@ -5094,6 +5102,10 @@ popd
|
||||
|
||||
# Old changelog entries are preserved in CVS.
|
||||
%changelog
|
||||
* Fri Nov 30 2018 Petr Pisar <ppisar@redhat.com> - 4:5.28.1-427
|
||||
- Fix script run matching to allow ASCII digits in scripts that use their own in
|
||||
addition (RT#133547)
|
||||
|
||||
* Fri Nov 30 2018 Jitka Plesnikova <jplesnik@redhat.com> - 4:5.28.1-426
|
||||
- 5.28.1 bump
|
||||
- Fix CVE-2018-18312 (heap-buffer-overflow write in regcomp.c)
|
||||
|
Loading…
Reference in New Issue
Block a user