You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
98 lines
3.8 KiB
98 lines
3.8 KiB
From 1b90dad20879f0e7a3eced5da0e0aacda93708ed Mon Sep 17 00:00:00 2001
|
|
From: Yves Orton <demerphq@gmail.com>
|
|
Date: Thu, 27 Oct 2016 13:52:24 +0200
|
|
Subject: [PATCH] regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8
|
|
with prefix optimisation
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Ported to 5.24.0:
|
|
|
|
commit da42332b10691ba7af7550035ffc7f46c87e4e66
|
|
Author: Yves Orton <demerphq@gmail.com>
|
|
Date: Thu Oct 27 13:52:24 2016 +0200
|
|
|
|
regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 with prefix optimisation
|
|
|
|
The trie code contains a number of sub optimisations, one of which
|
|
extracts common prefixes from alternations, and another which is a
|
|
bitmap of the possible matching first chars.
|
|
|
|
The bitmap needs to contain the possible first octets of the string
|
|
which the trie can match, and for codepoints which might have a different
|
|
first octet under utf8 or non-utf8, we need to register BOTH octets.
|
|
|
|
So for instance in the pattern (?:a|a\x{E4}) we should restructure this
|
|
as a(|\x{E4}), and the bitmap for the trie should contain both \x{E4} AND
|
|
\x{C3} as \x{C3} is the first byte of \x{E4} expressed as utf8.
|
|
|
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|
---
|
|
regcomp.c | 14 ++++++++++++++
|
|
t/re/pat.t | 9 ++++++++-
|
|
2 files changed, 22 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/regcomp.c b/regcomp.c
|
|
index 7462885..bcb8db5 100644
|
|
--- a/regcomp.c
|
|
+++ b/regcomp.c
|
|
@@ -3272,6 +3272,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
|
|
TRIE_BITMAP_SET(trie,*ch);
|
|
if ( folder )
|
|
TRIE_BITMAP_SET(trie, folder[ *ch ]);
|
|
+ if ( !UTF ) {
|
|
+ /* store first byte of utf8 representation of
|
|
+ variant codepoints */
|
|
+ if (! UVCHR_IS_INVARIANT(*ch)) {
|
|
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
|
|
+ }
|
|
+ }
|
|
DEBUG_OPTIMISE_r(
|
|
Perl_re_printf( aTHX_ "%s", (char*)ch)
|
|
);
|
|
@@ -3280,6 +3287,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
|
|
TRIE_BITMAP_SET(trie,*ch);
|
|
if ( folder )
|
|
TRIE_BITMAP_SET(trie,folder[ *ch ]);
|
|
+ if ( !UTF ) {
|
|
+ /* store first byte of utf8 representation of
|
|
+ variant codepoints */
|
|
+ if (! UVCHR_IS_INVARIANT(*ch)) {
|
|
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
|
|
+ }
|
|
+ }
|
|
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
|
|
}
|
|
idx = ofs;
|
|
diff --git a/t/re/pat.t b/t/re/pat.t
|
|
index 295a9f7..4aa77cf 100644
|
|
--- a/t/re/pat.t
|
|
+++ b/t/re/pat.t
|
|
@@ -23,7 +23,7 @@ BEGIN {
|
|
skip_all_without_unicode_tables();
|
|
}
|
|
|
|
-plan tests => 789; # Update this when adding/deleting tests.
|
|
+plan tests => 791; # Update this when adding/deleting tests.
|
|
|
|
run_tests() unless caller;
|
|
|
|
@@ -1758,6 +1758,13 @@ EOP
|
|
fresh_perl_is($code, $expect, {}, "$bug - $test_name" );
|
|
}
|
|
}
|
|
+
|
|
+ {
|
|
+ my $str = "a\xE4";
|
|
+ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - latin1 case" );
|
|
+ utf8::upgrade($str);
|
|
+ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - utf8 case" );
|
|
+ }
|
|
} # End of sub run_tests
|
|
|
|
1;
|
|
--
|
|
2.7.4
|
|
|