You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rust-grep/0001-grep-upgrade-to-regex-...

461 lines
17 KiB

From cd08707c7c82058559bd5557efb3c1d0379dbf1d Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Tue, 13 Mar 2018 20:38:50 -0400
Subject: [PATCH 1/2] grep: upgrade to regex-syntax 0.5
This update brings with it many bug fixes:
* Better error messages are printed overall. We also include
explicit call out for unsupported features like backreferences
and look-around.
* Regexes like `\s*{` no longer emit incomprehensible errors.
* Unicode escape sequences, such as `\u{..}` are now supported.
For the most part, this upgrade was done in a straight-forward way. We
resist the urge to refactor the `grep` crate, in anticipation of it
being rewritten anyway.
Note that we removed the `--fixed-strings` suggestion whenever a regex
syntax error occurs. In practice, I've found that it results in a lot of
false positives, and I believe that its use is not as paramount now that
regex parse errors are much more readable.
Closes #268, Closes #395, Closes #702, Closes #853
---
src/literals.rs | 119 +++++++++++++++++++++++-----------------------
src/nonl.rs | 85 ++++++++++++++++++---------------
src/search.rs | 19 ++++----
src/word_boundary.rs | 31 ++++++------
4 files changed, 130 insertions(+), 124 deletions(-)
diff --git a/src/literals.rs b/src/literals.rs
index eebeac4c7249..3e1c385bcda9 100644
--- a/src/literals.rs
+++ b/src/literals.rs
@@ -10,10 +10,8 @@ principled.
use std::cmp;
use regex::bytes::RegexBuilder;
-use syntax::{
- Expr, Literals, Lit,
- ByteClass, ByteRange, CharClass, ClassRange, Repeater,
-};
+use syntax::hir::{self, Hir, HirKind};
+use syntax::hir::literal::{Literal, Literals};
#[derive(Clone, Debug)]
pub struct LiteralSets {
@@ -23,12 +21,12 @@ pub struct LiteralSets {
}
impl LiteralSets {
- pub fn create(expr: &Expr) -> Self {
+ pub fn create(expr: &Hir) -> Self {
let mut required = Literals::empty();
union_required(expr, &mut required);
LiteralSets {
- prefixes: expr.prefixes(),
- suffixes: expr.suffixes(),
+ prefixes: Literals::prefixes(expr),
+ suffixes: Literals::suffixes(expr),
required: required,
}
}
@@ -93,60 +91,52 @@ impl LiteralSets {
}
}
-fn union_required(expr: &Expr, lits: &mut Literals) {
- use syntax::Expr::*;
- match *expr {
- Literal { ref chars, casei: false } => {
- let s: String = chars.iter().cloned().collect();
- lits.cross_add(s.as_bytes());
+fn union_required(expr: &Hir, lits: &mut Literals) {
+ match *expr.kind() {
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ let mut buf = [0u8; 4];
+ lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
}
- Literal { ref chars, casei: true } => {
- for &c in chars {
- let cls = CharClass::new(vec![
- ClassRange { start: c, end: c },
- ]).case_fold();
- if !lits.add_char_class(&cls) {
- lits.cut();
- return;
- }
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ lits.cross_add(&[b]);
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
+ lits.cut();
+ }
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
+ lits.cut();
}
}
- LiteralBytes { ref bytes, casei: false } => {
- lits.cross_add(bytes);
+ HirKind::Group(hir::Group { ref hir, .. }) => {
+ union_required(&**hir, lits);
}
- LiteralBytes { ref bytes, casei: true } => {
- for &b in bytes {
- let cls = ByteClass::new(vec![
- ByteRange { start: b, end: b },
- ]).case_fold();
- if !lits.add_byte_class(&cls) {
+ HirKind::Repetition(ref x) => {
+ match x.kind {
+ hir::RepetitionKind::ZeroOrOne => lits.cut(),
+ hir::RepetitionKind::ZeroOrMore => lits.cut(),
+ hir::RepetitionKind::OneOrMore => {
+ union_required(&x.hir, lits);
lits.cut();
- return;
+ }
+ hir::RepetitionKind::Range(ref rng) => {
+ let (min, max) = match *rng {
+ hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+ hir::RepetitionRange::AtLeast(m) => (m, None),
+ hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+ };
+ repeat_range_literals(
+ &x.hir, min, max, x.greedy, lits, union_required);
}
}
}
- Class(_) => {
- lits.cut();
- }
- ClassBytes(_) => {
- lits.cut();
+ HirKind::Concat(ref es) if es.is_empty() => {}
+ HirKind::Concat(ref es) if es.len() == 1 => {
+ union_required(&es[0], lits)
}
- Group { ref e, .. } => {
- union_required(&**e, lits);
- }
- Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(),
- Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
- Repeat { ref e, r: Repeater::OneOrMore, .. } => {
- union_required(&**e, lits);
- lits.cut();
- }
- Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
- repeat_range_literals(
- &**e, min, max, greedy, lits, union_required);
- }
- Concat(ref es) if es.is_empty() => {}
- Concat(ref es) if es.len() == 1 => union_required(&es[0], lits),
- Concat(ref es) => {
+ HirKind::Concat(ref es) => {
for e in es {
let mut lits2 = lits.to_empty();
union_required(e, &mut lits2);
@@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
if lits2.contains_empty() {
lits.cut();
}
- // if !lits.union(lits2) {
if !lits.cross_product(&lits2) {
// If this expression couldn't yield any literal that
// could be extended, then we need to quit. Since we're
@@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
}
}
}
- Alternate(ref es) => {
+ HirKind::Alternation(ref es) => {
alternate_literals(es, lits, union_required);
}
_ => lits.cut(),
}
}
-fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
- e: &Expr,
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+ e: &Hir,
min: u32,
max: Option<u32>,
_greedy: bool,
@@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
}
}
-fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
- es: &[Expr],
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+ es: &[Hir],
lits: &mut Literals,
mut f: F,
) {
@@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
}
lits.cut();
if !lcs.is_empty() {
- lits.add(Lit::empty());
- lits.add(Lit::new(lcs.to_vec()));
+ lits.add(Literal::empty());
+ lits.add(Literal::new(lcs.to_vec()));
}
}
+/// Return the number of characters in the given class.
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
+ cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
+/// Return the number of bytes in the given class.
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
+ cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
/// Converts an arbitrary sequence of bytes to a literal suitable for building
/// a regular expression.
fn bytes_to_regex(bs: &[u8]) -> String {
diff --git a/src/nonl.rs b/src/nonl.rs
index 361b0b003eb7..3beb5f61ce57 100644
--- a/src/nonl.rs
+++ b/src/nonl.rs
@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};
use {Error, Result};
@@ -9,59 +9,66 @@ use {Error, Result};
///
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
/// function panics.
-pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
- // TODO(burntsushi): There is a bug in this routine where only `\n` is
- // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
- // to proper character classes instead of the special `AnyCharNoNL` and
- // `AnyByteNoNL` classes.
- use syntax::Expr::*;
+pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
assert!(byte <= 0x7F);
let chr = byte as char;
assert!(chr.len_utf8() == 1);
- Ok(match expr {
- Literal { chars, casei } => {
- if chars.iter().position(|&c| c == chr).is_some() {
+ Ok(match expr.into_kind() {
+ HirKind::Empty => Hir::empty(),
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
+ if c == chr {
return Err(Error::LiteralNotAllowed(chr));
}
- Literal { chars: chars, casei: casei }
+ Hir::literal(hir::Literal::Unicode(c))
}
- LiteralBytes { bytes, casei } => {
- if bytes.iter().position(|&b| b == byte).is_some() {
+ HirKind::Literal(hir::Literal::Byte(b)) => {
+ if b as char == chr {
return Err(Error::LiteralNotAllowed(chr));
}
- LiteralBytes { bytes: bytes, casei: casei }
+ Hir::literal(hir::Literal::Byte(b))
}
- AnyChar => AnyCharNoNL,
- AnyByte => AnyByteNoNL,
- Class(mut cls) => {
- cls.remove(chr);
- Class(cls)
- }
- ClassBytes(mut cls) => {
- cls.remove(byte);
- ClassBytes(cls)
- }
- Group { e, i, name } => {
- Group {
- e: Box::new(remove(*e, byte)?),
- i: i,
- name: name,
+ HirKind::Class(hir::Class::Unicode(mut cls)) => {
+ let remove = hir::ClassUnicode::new(Some(
+ hir::ClassUnicodeRange::new(chr, chr),
+ ));
+ cls.difference(&remove);
+ if cls.iter().next().is_none() {
+ return Err(Error::LiteralNotAllowed(chr));
}
+ Hir::class(hir::Class::Unicode(cls))
}
- Repeat { e, r, greedy } => {
- Repeat {
- e: Box::new(remove(*e, byte)?),
- r: r,
- greedy: greedy,
+ HirKind::Class(hir::Class::Bytes(mut cls)) => {
+ let remove = hir::ClassBytes::new(Some(
+ hir::ClassBytesRange::new(byte, byte),
+ ));
+ cls.difference(&remove);
+ if cls.iter().next().is_none() {
+ return Err(Error::LiteralNotAllowed(chr));
}
+ Hir::class(hir::Class::Bytes(cls))
+ }
+ HirKind::Anchor(x) => Hir::anchor(x),
+ HirKind::WordBoundary(x) => Hir::word_boundary(x),
+ HirKind::Repetition(mut x) => {
+ x.hir = Box::new(remove(*x.hir, byte)?);
+ Hir::repetition(x)
+ }
+ HirKind::Group(mut x) => {
+ x.hir = Box::new(remove(*x.hir, byte)?);
+ Hir::group(x)
}
- Concat(exprs) => {
- Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
+ HirKind::Concat(xs) => {
+ let xs = xs.into_iter()
+ .map(|e| remove(e, byte))
+ .collect::<Result<Vec<Hir>>>()?;
+ Hir::concat(xs)
}
- Alternate(exprs) => {
- Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
+ HirKind::Alternation(xs) => {
+ let xs = xs.into_iter()
+ .map(|e| remove(e, byte))
+ .collect::<Result<Vec<Hir>>>()?;
+ Hir::alternation(xs)
}
- e => e,
})
}
diff --git a/src/search.rs b/src/search.rs
index 8d056796ac14..1d5d7e29cccf 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,10 +1,10 @@
use memchr::{memchr, memrchr};
use regex::bytes::{Regex, RegexBuilder};
-use syntax;
use literals::LiteralSets;
use nonl;
-use syntax::Expr;
+use syntax::ParserBuilder;
+use syntax::hir::Hir;
use word_boundary::strip_unicode_word_boundaries;
use Result;
@@ -166,7 +166,7 @@ impl GrepBuilder {
/// Creates a new regex from the given expression with the current
/// configuration.
- fn regex(&self, expr: &Expr) -> Result<Regex> {
+ fn regex(&self, expr: &Hir) -> Result<Regex> {
let mut builder = RegexBuilder::new(&expr.to_string());
builder.unicode(true);
self.regex_build(builder)
@@ -184,15 +184,16 @@ impl GrepBuilder {
/// Parses the underlying pattern and ensures the pattern can never match
/// the line terminator.
- fn parse(&self) -> Result<syntax::Expr> {
- let expr =
- syntax::ExprBuilder::new()
- .allow_bytes(true)
- .unicode(true)
+ fn parse(&self) -> Result<Hir> {
+ let expr = ParserBuilder::new()
+ .allow_invalid_utf8(true)
.case_insensitive(self.is_case_insensitive()?)
+ .multi_line(true)
+ .build()
.parse(&self.pattern)?;
+ debug!("original regex HIR pattern:\n{}", expr);
let expr = nonl::remove(expr, self.opts.line_terminator)?;
- debug!("regex ast:\n{:#?}", expr);
+ debug!("transformed regex HIR pattern:\n{}", expr);
Ok(expr)
}
diff --git a/src/word_boundary.rs b/src/word_boundary.rs
index 6df5c6574933..8e6b86d12df8 100644
--- a/src/word_boundary.rs
+++ b/src/word_boundary.rs
@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};
/// Strips Unicode word boundaries from the given expression.
///
@@ -8,7 +8,7 @@ use syntax::Expr;
/// false negatives.
///
/// If no word boundaries could be stripped, then None is returned.
-pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
+pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
// The real reason we do this is because Unicode word boundaries are the
// one thing that Rust's regex DFA engine can't handle. When it sees a
// Unicode word boundary among non-ASCII text, it falls back to one of the
@@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
// a regex to find candidate matches without a Unicode word boundary. We'll
// only then use the full (and slower) regex to confirm a candidate as a
// match or not during search.
- use syntax::Expr::*;
-
- match *expr {
- Concat(ref es) if !es.is_empty() => {
+ //
+ // It looks like we only check the outer edges for `\b`? I guess this is
+ // an attempt to optimize for the `-w/--word-regexp` flag? ---AG
+ match *expr.kind() {
+ HirKind::Concat(ref es) if !es.is_empty() => {
let first = is_unicode_word_boundary(&es[0]);
let last = is_unicode_word_boundary(es.last().unwrap());
// Be careful not to strip word boundaries if there are no other
// expressions to match.
match (first, last) {
(true, false) if es.len() > 1 => {
- Some(Concat(es[1..].to_vec()))
+ Some(Hir::concat(es[1..].to_vec()))
}
(false, true) if es.len() > 1 => {
- Some(Concat(es[..es.len() - 1].to_vec()))
+ Some(Hir::concat(es[..es.len() - 1].to_vec()))
}
(true, true) if es.len() > 2 => {
- Some(Concat(es[1..es.len() - 1].to_vec()))
+ Some(Hir::concat(es[1..es.len() - 1].to_vec()))
}
_ => None,
}
@@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
}
/// Returns true if the given expression is a Unicode word boundary.
-fn is_unicode_word_boundary(expr: &Expr) -> bool {
- use syntax::Expr::*;
-
- match *expr {
- WordBoundary => true,
- NotWordBoundary => true,
- Group { ref e, .. } => is_unicode_word_boundary(e),
+fn is_unicode_word_boundary(expr: &Hir) -> bool {
+ match *expr.kind() {
+ HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
+ HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
+ HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
_ => false,
}
}
--
2.14.3