|
|
From 10ce49389ea9ee26a3b02b6494b0a3849d56c6fa Mon Sep 17 00:00:00 2001
|
|
|
From: Yves Orton <demerphq@gmail.com>
|
|
|
Date: Mon, 26 Jun 2017 13:19:55 +0200
|
|
|
Subject: [PATCH] fix #131649 - extended charclass can trigger assert
|
|
|
|
|
|
The extended charclass parser makes some assumptions during the
|
|
|
first pass which are only true on well structured input, and it
|
|
|
does not properly catch various errors. Later on the code assumes
|
|
|
that things the first pass will let through are valid, when in
|
|
|
fact they should trigger errors.
|
|
|
|
|
|
(cherry picked from commit 19a498a461d7c81ae3507c450953d1148efecf4f)
|
|
|
---
|
|
|
pod/perldiag.pod | 27 ++++++++++++++++++++++++++-
|
|
|
pod/perlrecharclass.pod | 4 ++--
|
|
|
regcomp.c | 28 ++++++++++++++++++----------
|
|
|
t/lib/warnings/regcomp | 6 +++---
|
|
|
t/re/reg_mesg.t | 29 ++++++++++++++++-------------
|
|
|
t/re/regex_sets.t | 6 +++---
|
|
|
6 files changed, 68 insertions(+), 32 deletions(-)
|
|
|
|
|
|
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
|
|
|
index 106fe41121..c29925a2a4 100644
|
|
|
--- a/pod/perldiag.pod
|
|
|
+++ b/pod/perldiag.pod
|
|
|
@@ -5904,7 +5904,7 @@ yourself.
|
|
|
a perl4 interpreter, especially if the next 2 tokens are "use strict"
|
|
|
or "my $var" or "our $var".
|
|
|
|
|
|
-=item Syntax error in (?[...]) in regex m/%s/
|
|
|
+=item Syntax error in (?[...]) in regex; marked by <-- HERE in m/%s/
|
|
|
|
|
|
(F) Perl could not figure out what you meant inside this construct; this
|
|
|
notifies you that it is giving up trying.
|
|
|
@@ -6402,6 +6402,31 @@ to find out why that isn't happening.
|
|
|
(F) The unexec() routine failed for some reason. See your local FSF
|
|
|
representative, who probably put it there in the first place.
|
|
|
|
|
|
+=item Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/%s/
|
|
|
+
|
|
|
+(F) While parsing an extended character class a ']' character was encountered
|
|
|
+at a point in the definition where the only legal use of ']' is to close the
|
|
|
+character class definition as part of a '])'. You may have forgotten the close
|
|
|
+paren, or otherwise confused the parser.
|
|
|
+
|
|
|
+=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/
|
|
|
+
|
|
|
+(F) While parsing a nested extended character class like:
|
|
|
+
|
|
|
+    (?[ ... (?flags:(?[ ... ])) ... ])
|
|
|
+                             ^
|
|
|
+
|
|
|
+we expected to see a close paren ')' (marked by ^) but did not.
|
|
|
+
|
|
|
+=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/
|
|
|
+
|
|
|
+(F) While parsing a nested extended character class like:
|
|
|
+
|
|
|
+    (?[ ... (?flags:(?[ ... ])) ... ])
|
|
|
+                              ^
|
|
|
+
|
|
|
+we expected to see a close paren ')' (marked by ^) but did not.
|
|
|
+
|
|
|
=item Unexpected binary operator '%c' with no preceding operand in regex;
|
|
|
marked by S<<-- HERE> in m/%s/
|
|
|
|
|
|
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
|
|
|
index 79480e4131..8c008507d1 100644
|
|
|
--- a/pod/perlrecharclass.pod
|
|
|
+++ b/pod/perlrecharclass.pod
|
|
|
@@ -1128,8 +1128,8 @@ hence both of the following work:
|
|
|
Any contained POSIX character classes, including things like C<\w> and C<\D>
|
|
|
respect the C<E<sol>a> (and C<E<sol>aa>) modifiers.
|
|
|
|
|
|
-C<< (?[ ]) >> is a regex-compile-time construct. Any attempt to use
|
|
|
-something which isn't knowable at the time the containing regular
|
|
|
+Note that C<< (?[ ]) >> is a regex-compile-time construct. Any attempt
|
|
|
+to use something which isn't knowable at the time the containing regular
|
|
|
expression is compiled is a fatal error. In practice, this means
|
|
|
just three limitations:
|
|
|
|
|
|
diff --git a/regcomp.c b/regcomp.c
|
|
|
index 4ee48ede42..ddac290d2b 100644
|
|
|
--- a/regcomp.c
|
|
|
+++ b/regcomp.c
|
|
|
@@ -14840,8 +14840,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|
|
|
TRUE /* Force /x */ );
|
|
|
|
|
|
switch (*RExC_parse) {
|
|
|
- case '?':
|
|
|
- if (RExC_parse[1] == '[') depth++, RExC_parse++;
|
|
|
+ case '(':
|
|
|
+ if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
|
|
|
+ depth++, RExC_parse+=2;
|
|
|
/* FALLTHROUGH */
|
|
|
default:
|
|
|
break;
|
|
|
@@ -14898,9 +14899,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|
|
|
}
|
|
|
|
|
|
case ']':
|
|
|
- if (depth--) break;
|
|
|
- RExC_parse++;
|
|
|
- if (*RExC_parse == ')') {
|
|
|
+ if (RExC_parse[1] == ')') {
|
|
|
+ RExC_parse++;
|
|
|
+ if (depth--) break;
|
|
|
node = reganode(pRExC_state, ANYOF, 0);
|
|
|
RExC_size += ANYOF_SKIP;
|
|
|
nextchar(pRExC_state);
|
|
|
@@ -14912,20 +14913,25 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|
|
|
|
|
|
return node;
|
|
|
}
|
|
|
- goto no_close;
|
|
|
+ /* We output the messages even if warnings are off, because we'll fail
|
|
|
+ * the very next thing, and these give a likely diagnosis for that */
|
|
|
+ if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
|
|
|
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
|
|
|
+ }
|
|
|
+ RExC_parse++;
|
|
|
+ vFAIL("Unexpected ']' with no following ')' in (?[...");
|
|
|
}
|
|
|
|
|
|
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
|
|
|
}
|
|
|
|
|
|
- no_close:
|
|
|
/* We output the messages even if warnings are off, because we'll fail
|
|
|
* the very next thing, and these give a likely diagnosis for that */
|
|
|
if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
|
|
|
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
|
|
|
}
|
|
|
|
|
|
- FAIL("Syntax error in (?[...])");
|
|
|
+ vFAIL("Syntax error in (?[...])");
|
|
|
}
|
|
|
|
|
|
/* Pass 2 only after this. */
|
|
|
@@ -15105,12 +15111,14 @@ redo_curchar:
|
|
|
* inversion list, and RExC_parse points to the trailing
|
|
|
* ']'; the next character should be the ')' */
|
|
|
RExC_parse++;
|
|
|
- assert(UCHARAT(RExC_parse) == ')');
|
|
|
+ if (UCHARAT(RExC_parse) != ')')
|
|
|
+ vFAIL("Expecting close paren for nested extended charclass");
|
|
|
|
|
|
/* Then the ')' matching the original '(' handled by this
|
|
|
* case: statement */
|
|
|
RExC_parse++;
|
|
|
- assert(UCHARAT(RExC_parse) == ')');
|
|
|
+ if (UCHARAT(RExC_parse) != ')')
|
|
|
+ vFAIL("Expecting close paren for wrapper for nested extended charclass");
|
|
|
|
|
|
RExC_parse++;
|
|
|
RExC_flags = save_flags;
|
|
|
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
|
|
|
index 2b084c59b0..51ad57ccbe 100644
|
|
|
--- a/t/lib/warnings/regcomp
|
|
|
+++ b/t/lib/warnings/regcomp
|
|
|
@@ -59,21 +59,21 @@ Unmatched [ in regex; marked by <-- HERE in m/abc[ <-- HERE fi[.00./ at - line
|
|
|
qr/(?[[[:word]]])/;
|
|
|
EXPECT
|
|
|
Assuming NOT a POSIX class since there is no terminating ':' in regex; marked by <-- HERE in m/(?[[[:word <-- HERE ]]])/ at - line 2.
|
|
|
-syntax error in (?[...]) in regex m/(?[[[:word]]])/ at - line 2.
|
|
|
+Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/(?[[[:word]] <-- HERE ])/ at - line 2.
|
|
|
########
|
|
|
# NAME qr/(?[ [[:digit: ])/
|
|
|
# OPTION fatal
|
|
|
qr/(?[[[:digit: ])/;
|
|
|
EXPECT
|
|
|
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[[:digit: ] <-- HERE )/ at - line 2.
|
|
|
-syntax error in (?[...]) in regex m/(?[[[:digit: ])/ at - line 2.
|
|
|
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[[:digit: ]) <-- HERE / at - line 2.
|
|
|
########
|
|
|
# NAME qr/(?[ [:digit: ])/
|
|
|
# OPTION fatal
|
|
|
qr/(?[[:digit: ])/
|
|
|
EXPECT
|
|
|
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[:digit: ] <-- HERE )/ at - line 2.
|
|
|
-syntax error in (?[...]) in regex m/(?[[:digit: ])/ at - line 2.
|
|
|
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[:digit: ]) <-- HERE / at - line 2.
|
|
|
########
|
|
|
# NAME [perl #126141]
|
|
|
# OPTION fatal
|
|
|
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
|
|
|
index d26a7caf37..5194d93751 100644
|
|
|
--- a/t/re/reg_mesg.t
|
|
|
+++ b/t/re/reg_mesg.t
|
|
|
@@ -215,8 +215,9 @@ my @death =
|
|
|
'/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
|
|
|
'/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
|
|
|
|
|
|
- '/(?[[[::]]])/' => "Syntax error in (?[...]) in regex m/(?[[[::]]])/",
|
|
|
- '/(?[[[:w:]]])/' => "Syntax error in (?[...]) in regex m/(?[[[:w:]]])/",
|
|
|
+
|
|
|
+ '/(?[[[::]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[::]]{#}])/",
|
|
|
+ '/(?[[[:w:]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[:w:]]{#}])/",
|
|
|
'/(?[[:w:]])/' => "",
|
|
|
'/([.].*)[.]/' => "", # [perl #127582]
|
|
|
'/[.].*[.]/' => "", # [perl #127604]
|
|
|
@@ -239,11 +240,12 @@ my @death =
|
|
|
'/(?[ \p{foo} ])/' => 'Can\'t find Unicode property definition "foo" {#} m/(?[ \p{foo}{#} ])/',
|
|
|
'/(?[ \p{ foo = bar } ])/' => 'Can\'t find Unicode property definition "foo = bar" {#} m/(?[ \p{ foo = bar }{#} ])/',
|
|
|
'/(?[ \8 ])/' => 'Unrecognized escape \8 in character class {#} m/(?[ \8{#} ])/',
|
|
|
- '/(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ]/',
|
|
|
- '/(?[ [ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ \t ]/',
|
|
|
- '/(?[ \t ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ] ]/',
|
|
|
- '/(?[ [ ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ ] ]/',
|
|
|
- '/(?[ \t + \e # This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # This was supposed to be a comment ])/',
|
|
|
+ '/(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#}/",
|
|
|
+ '/(?[ [ \t ]/' => "Syntax error in (?[...]) {#} m/(?[ [ \\t ]{#}/",
|
|
|
+ '/(?[ \t ] ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#} ]/",
|
|
|
+ '/(?[ [ ] ]/' => "Syntax error in (?[...]) {#} m/(?[ [ ] ]{#}/",
|
|
|
+ '/(?[ \t + \e # This was supposed to be a comment ])/' =>
|
|
|
+ "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # This was supposed to be a comment ]){#}/",
|
|
|
'/(?[ ])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[ {#}])/',
|
|
|
'm/(?[[a-\d]])/' => 'False [] range "a-\d" {#} m/(?[[a-\d{#}]])/',
|
|
|
'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
|
|
|
@@ -431,10 +433,10 @@ my @death_utf8 = mark_as_utf8(
|
|
|
|
|
|
'/ネ\p{}ネ/' => 'Empty \p{} {#} m/ネ\p{{#}}ネ/',
|
|
|
|
|
|
- '/ネ(?[[[:ネ]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ]]])ネ/",
|
|
|
- '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ: ])ネ/",
|
|
|
- '/ネ(?[[[::]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[::]]])ネ/",
|
|
|
- '/ネ(?[[[:ネ:]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ:]]])ネ/",
|
|
|
+ '/ネ(?[[[:ネ]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ]]{#}])ネ/",
|
|
|
+ '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) {#} m/ネ(?[[[:ネ: ])ネ{#}/",
|
|
|
+ '/ネ(?[[[::]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[::]]{#}])ネ/",
|
|
|
+ '/ネ(?[[[:ネ:]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ:]]{#}])ネ/",
|
|
|
'/ネ(?[[:ネ:]])ネ/' => "",
|
|
|
'/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
|
|
|
'/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
|
|
|
@@ -447,8 +449,9 @@ my @death_utf8 = mark_as_utf8(
|
|
|
'/(?[ \x{ネ} ])ネ/' => 'Non-hex character {#} m/(?[ \x{ネ{#}} ])ネ/',
|
|
|
'/(?[ \p{ネ} ])/' => 'Can\'t find Unicode property definition "ネ" {#} m/(?[ \p{ネ}{#} ])/',
|
|
|
'/(?[ \p{ ネ = bar } ])/' => 'Can\'t find Unicode property definition "ネ = bar" {#} m/(?[ \p{ ネ = bar }{#} ])/',
|
|
|
- '/ネ(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/ネ(?[ \t ]/',
|
|
|
- '/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
|
|
|
+ '/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
|
|
|
+ '/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
|
|
|
+ "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
|
|
|
'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
|
|
|
'/\cネ/' => "Character following \"\\c\" must be printable ASCII",
|
|
|
'/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
|
|
|
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
|
|
|
index 6a79f9d692..e9644bd4e6 100644
|
|
|
--- a/t/re/regex_sets.t
|
|
|
+++ b/t/re/regex_sets.t
|
|
|
@@ -158,13 +158,13 @@ for my $char ("٠", "٥", "٩") {
|
|
|
eval { $_ = '/(?[(\c]) /'; qr/$_/ };
|
|
|
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
|
|
|
eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
|
|
|
- like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
|
|
|
+ like($@, qr/^Unexpected/, '/(?[(\c]) / should not panic');
|
|
|
eval { $_ = '(?[(\c])'; qr/$_/ };
|
|
|
like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
|
|
|
eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
|
|
|
- like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
|
|
|
+ like($@, qr/^Unexpected/, '/(?[(\c]) ]\b/ should be a syntax error');
|
|
|
eval { $_ = '(?[\c[]](])'; qr/$_/ };
|
|
|
- like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
|
|
|
+ like($@, qr/^Unexpected/, '/(?[\c[]](])/ should be a syntax error');
|
|
|
like("\c#", qr/(?[\c#])/, '\c# should match itself');
|
|
|
like("\c[", qr/(?[\c[])/, '\c[ should match itself');
|
|
|
like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');
|
|
|
--
|
|
|
2.11.0
|
|
|
|