From cd08707c7c82058559bd5557efb3c1d0379dbf1d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 13 Mar 2018 20:38:50 -0400 Subject: [PATCH 1/2] grep: upgrade to regex-syntax 0.5 This update brings with it many bug fixes: * Better error messages are printed overall. We also include explicit call out for unsupported features like backreferences and look-around. * Regexes like `\s*{` no longer emit incomprehensible errors. * Unicode escape sequences, such as `\u{..}` are now supported. For the most part, this upgrade was done in a straight-forward way. We resist the urge to refactor the `grep` crate, in anticipation of it being rewritten anyway. Note that we removed the `--fixed-strings` suggestion whenever a regex syntax error occurs. In practice, I've found that it results in a lot of false positives, and I believe that its use is not as paramount now that regex parse errors are much more readable. Closes #268, Closes #395, Closes #702, Closes #853 --- src/literals.rs | 119 +++++++++++++++++++++++----------------------- src/nonl.rs | 85 ++++++++++++++++++--------------- src/search.rs | 19 ++++---- src/word_boundary.rs | 31 ++++++------ 4 files changed, 130 insertions(+), 124 deletions(-) diff --git a/src/literals.rs b/src/literals.rs index eebeac4c7249..3e1c385bcda9 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -10,10 +10,8 @@ principled. use std::cmp; use regex::bytes::RegexBuilder; -use syntax::{ - Expr, Literals, Lit, - ByteClass, ByteRange, CharClass, ClassRange, Repeater, -}; +use syntax::hir::{self, Hir, HirKind}; +use syntax::hir::literal::{Literal, Literals}; #[derive(Clone, Debug)] pub struct LiteralSets { @@ -23,12 +21,12 @@ pub struct LiteralSets { } impl LiteralSets { - pub fn create(expr: &Expr) -> Self { + pub fn create(expr: &Hir) -> Self { let mut required = Literals::empty(); union_required(expr, &mut required); LiteralSets { - prefixes: expr.prefixes(), - suffixes: expr.suffixes(), + prefixes: Literals::prefixes(expr), + suffixes: Literals::suffixes(expr), required: required, } } @@ -93,60 +91,52 @@ impl LiteralSets { } } -fn union_required(expr: &Expr, lits: &mut Literals) { - use syntax::Expr::*; - match *expr { - Literal { ref chars, casei: false } => { - let s: String = chars.iter().cloned().collect(); - lits.cross_add(s.as_bytes()); +fn union_required(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); } - Literal { ref chars, casei: true } => { - for &c in chars { - let cls = CharClass::new(vec![ - ClassRange { start: c, end: c }, - ]).case_fold(); - if !lits.add_char_class(&cls) { - lits.cut(); - return; - } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) { + lits.cut(); } } - LiteralBytes { ref bytes, casei: false } => { - lits.cross_add(bytes); + HirKind::Group(hir::Group { ref hir, .. }) => { + union_required(&**hir, lits); } - LiteralBytes { ref bytes, casei: true } => { - for &b in bytes { - let cls = ByteClass::new(vec![ - ByteRange { start: b, end: b }, - ]).case_fold(); - if !lits.add_byte_class(&cls) { + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => lits.cut(), + hir::RepetitionKind::ZeroOrMore => lits.cut(), + hir::RepetitionKind::OneOrMore => { + union_required(&x.hir, lits); lits.cut(); - return; + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => (m, Some(m)), + hir::RepetitionRange::AtLeast(m) => (m, None), + hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, union_required); } } } - Class(_) => { - lits.cut(); - } - ClassBytes(_) => { - lits.cut(); + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => { + union_required(&es[0], lits) } - Group { ref e, .. } => { - union_required(&**e, lits); - } - Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(), - Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(), - Repeat { ref e, r: Repeater::OneOrMore, .. } => { - union_required(&**e, lits); - lits.cut(); - } - Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { - repeat_range_literals( - &**e, min, max, greedy, lits, union_required); - } - Concat(ref es) if es.is_empty() => {} - Concat(ref es) if es.len() == 1 => union_required(&es[0], lits), - Concat(ref es) => { + HirKind::Concat(ref es) => { for e in es { let mut lits2 = lits.to_empty(); union_required(e, &mut lits2); @@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) { if lits2.contains_empty() { lits.cut(); } - // if !lits.union(lits2) { if !lits.cross_product(&lits2) { // If this expression couldn't yield any literal that // could be extended, then we need to quit. Since we're @@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) { } } } - Alternate(ref es) => { + HirKind::Alternation(ref es) => { alternate_literals(es, lits, union_required); } _ => lits.cut(), } } -fn repeat_range_literals( - e: &Expr, +fn repeat_range_literals( + e: &Hir, min: u32, max: Option, _greedy: bool, @@ -204,8 +193,8 @@ fn repeat_range_literals( } } -fn alternate_literals( - es: &[Expr], +fn alternate_literals( + es: &[Hir], lits: &mut Literals, mut f: F, ) { @@ -234,11 +223,21 @@ fn alternate_literals( } lits.cut(); if !lcs.is_empty() { - lits.add(Lit::empty()); - lits.add(Lit::new(lcs.to_vec())); + lits.add(Literal::empty()); + lits.add(Literal::new(lcs.to_vec())); } } +/// Return the number of characters in the given class. +fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + +/// Return the number of bytes in the given class. +fn count_byte_class(cls: &hir::ClassBytes) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + /// Converts an arbitrary sequence of bytes to a literal suitable for building /// a regular expression. fn bytes_to_regex(bs: &[u8]) -> String { diff --git a/src/nonl.rs b/src/nonl.rs index 361b0b003eb7..3beb5f61ce57 100644 --- a/src/nonl.rs +++ b/src/nonl.rs @@ -1,4 +1,4 @@ -use syntax::Expr; +use syntax::hir::{self, Hir, HirKind}; use {Error, Result}; @@ -9,59 +9,66 @@ use {Error, Result}; /// /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this /// function panics. -pub fn remove(expr: Expr, byte: u8) -> Result { - // TODO(burntsushi): There is a bug in this routine where only `\n` is - // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated - // to proper character classes instead of the special `AnyCharNoNL` and - // `AnyByteNoNL` classes. - use syntax::Expr::*; +pub fn remove(expr: Hir, byte: u8) -> Result { assert!(byte <= 0x7F); let chr = byte as char; assert!(chr.len_utf8() == 1); - Ok(match expr { - Literal { chars, casei } => { - if chars.iter().position(|&c| c == chr).is_some() { + Ok(match expr.into_kind() { + HirKind::Empty => Hir::empty(), + HirKind::Literal(hir::Literal::Unicode(c)) => { + if c == chr { return Err(Error::LiteralNotAllowed(chr)); } - Literal { chars: chars, casei: casei } + Hir::literal(hir::Literal::Unicode(c)) } - LiteralBytes { bytes, casei } => { - if bytes.iter().position(|&b| b == byte).is_some() { + HirKind::Literal(hir::Literal::Byte(b)) => { + if b as char == chr { return Err(Error::LiteralNotAllowed(chr)); } - LiteralBytes { bytes: bytes, casei: casei } + Hir::literal(hir::Literal::Byte(b)) } - AnyChar => AnyCharNoNL, - AnyByte => AnyByteNoNL, - Class(mut cls) => { - cls.remove(chr); - Class(cls) - } - ClassBytes(mut cls) => { - cls.remove(byte); - ClassBytes(cls) - } - Group { e, i, name } => { - Group { - e: Box::new(remove(*e, byte)?), - i: i, - name: name, + HirKind::Class(hir::Class::Unicode(mut cls)) => { + let remove = hir::ClassUnicode::new(Some( + hir::ClassUnicodeRange::new(chr, chr), + )); + cls.difference(&remove); + if cls.iter().next().is_none() { + return Err(Error::LiteralNotAllowed(chr)); } + Hir::class(hir::Class::Unicode(cls)) } - Repeat { e, r, greedy } => { - Repeat { - e: Box::new(remove(*e, byte)?), - r: r, - greedy: greedy, + HirKind::Class(hir::Class::Bytes(mut cls)) => { + let remove = hir::ClassBytes::new(Some( + hir::ClassBytesRange::new(byte, byte), + )); + cls.difference(&remove); + if cls.iter().next().is_none() { + return Err(Error::LiteralNotAllowed(chr)); } + Hir::class(hir::Class::Bytes(cls)) + } + HirKind::Anchor(x) => Hir::anchor(x), + HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Repetition(mut x) => { + x.hir = Box::new(remove(*x.hir, byte)?); + Hir::repetition(x) + } + HirKind::Group(mut x) => { + x.hir = Box::new(remove(*x.hir, byte)?); + Hir::group(x) } - Concat(exprs) => { - Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::>>()?) + HirKind::Concat(xs) => { + let xs = xs.into_iter() + .map(|e| remove(e, byte)) + .collect::>>()?; + Hir::concat(xs) } - Alternate(exprs) => { - Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::>>()?) + HirKind::Alternation(xs) => { + let xs = xs.into_iter() + .map(|e| remove(e, byte)) + .collect::>>()?; + Hir::alternation(xs) } - e => e, }) } diff --git a/src/search.rs b/src/search.rs index 8d056796ac14..1d5d7e29cccf 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,10 +1,10 @@ use memchr::{memchr, memrchr}; use regex::bytes::{Regex, RegexBuilder}; -use syntax; use literals::LiteralSets; use nonl; -use syntax::Expr; +use syntax::ParserBuilder; +use syntax::hir::Hir; use word_boundary::strip_unicode_word_boundaries; use Result; @@ -166,7 +166,7 @@ impl GrepBuilder { /// Creates a new regex from the given expression with the current /// configuration. - fn regex(&self, expr: &Expr) -> Result { + fn regex(&self, expr: &Hir) -> Result { let mut builder = RegexBuilder::new(&expr.to_string()); builder.unicode(true); self.regex_build(builder) @@ -184,15 +184,16 @@ impl GrepBuilder { /// Parses the underlying pattern and ensures the pattern can never match /// the line terminator. - fn parse(&self) -> Result { - let expr = - syntax::ExprBuilder::new() - .allow_bytes(true) - .unicode(true) + fn parse(&self) -> Result { + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) .case_insensitive(self.is_case_insensitive()?) + .multi_line(true) + .build() .parse(&self.pattern)?; + debug!("original regex HIR pattern:\n{}", expr); let expr = nonl::remove(expr, self.opts.line_terminator)?; - debug!("regex ast:\n{:#?}", expr); + debug!("transformed regex HIR pattern:\n{}", expr); Ok(expr) } diff --git a/src/word_boundary.rs b/src/word_boundary.rs index 6df5c6574933..8e6b86d12df8 100644 --- a/src/word_boundary.rs +++ b/src/word_boundary.rs @@ -1,4 +1,4 @@ -use syntax::Expr; +use syntax::hir::{self, Hir, HirKind}; /// Strips Unicode word boundaries from the given expression. /// @@ -8,7 +8,7 @@ use syntax::Expr; /// false negatives. /// /// If no word boundaries could be stripped, then None is returned. -pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option { +pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option { // The real reason we do this is because Unicode word boundaries are the // one thing that Rust's regex DFA engine can't handle. When it sees a // Unicode word boundary among non-ASCII text, it falls back to one of the @@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option { // a regex to find candidate matches without a Unicode word boundary. We'll // only then use the full (and slower) regex to confirm a candidate as a // match or not during search. - use syntax::Expr::*; - - match *expr { - Concat(ref es) if !es.is_empty() => { + // + // It looks like we only check the outer edges for `\b`? I guess this is + // an attempt to optimize for the `-w/--word-regexp` flag? ---AG + match *expr.kind() { + HirKind::Concat(ref es) if !es.is_empty() => { let first = is_unicode_word_boundary(&es[0]); let last = is_unicode_word_boundary(es.last().unwrap()); // Be careful not to strip word boundaries if there are no other // expressions to match. match (first, last) { (true, false) if es.len() > 1 => { - Some(Concat(es[1..].to_vec())) + Some(Hir::concat(es[1..].to_vec())) } (false, true) if es.len() > 1 => { - Some(Concat(es[..es.len() - 1].to_vec())) + Some(Hir::concat(es[..es.len() - 1].to_vec())) } (true, true) if es.len() > 2 => { - Some(Concat(es[1..es.len() - 1].to_vec())) + Some(Hir::concat(es[1..es.len() - 1].to_vec())) } _ => None, } @@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option { } /// Returns true if the given expression is a Unicode word boundary. -fn is_unicode_word_boundary(expr: &Expr) -> bool { - use syntax::Expr::*; - - match *expr { - WordBoundary => true, - NotWordBoundary => true, - Group { ref e, .. } => is_unicode_word_boundary(e), +fn is_unicode_word_boundary(expr: &Hir) -> bool { + match *expr.kind() { + HirKind::WordBoundary(hir::WordBoundary::Unicode) => true, + HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true, + HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir), _ => false, } } -- 2.14.3