parent
fc7616ce77
commit
d77b3dcd43
@ -1,2 +1,3 @@
|
|||||||
/grep-0.1.7.crate
|
/grep-0.1.7.crate
|
||||||
/grep-0.1.8.crate
|
/grep-0.1.8.crate
|
||||||
|
/grep-0.1.9.crate
|
||||||
|
@ -1,460 +0,0 @@
|
|||||||
From cd08707c7c82058559bd5557efb3c1d0379dbf1d Mon Sep 17 00:00:00 2001
|
|
||||||
From: Andrew Gallant <jamslam@gmail.com>
|
|
||||||
Date: Tue, 13 Mar 2018 20:38:50 -0400
|
|
||||||
Subject: [PATCH 1/2] grep: upgrade to regex-syntax 0.5
|
|
||||||
|
|
||||||
This update brings with it many bug fixes:
|
|
||||||
|
|
||||||
* Better error messages are printed overall. We also include
|
|
||||||
explicit call out for unsupported features like backreferences
|
|
||||||
and look-around.
|
|
||||||
* Regexes like `\s*{` no longer emit incomprehensible errors.
|
|
||||||
* Unicode escape sequences, such as `\u{..}` are now supported.
|
|
||||||
|
|
||||||
For the most part, this upgrade was done in a straight-forward way. We
|
|
||||||
resist the urge to refactor the `grep` crate, in anticipation of it
|
|
||||||
being rewritten anyway.
|
|
||||||
|
|
||||||
Note that we removed the `--fixed-strings` suggestion whenever a regex
|
|
||||||
syntax error occurs. In practice, I've found that it results in a lot of
|
|
||||||
false positives, and I believe that its use is not as paramount now that
|
|
||||||
regex parse errors are much more readable.
|
|
||||||
|
|
||||||
Closes #268, Closes #395, Closes #702, Closes #853
|
|
||||||
---
|
|
||||||
src/literals.rs | 119 +++++++++++++++++++++++-----------------------
|
|
||||||
src/nonl.rs | 85 ++++++++++++++++++---------------
|
|
||||||
src/search.rs | 19 ++++----
|
|
||||||
src/word_boundary.rs | 31 ++++++------
|
|
||||||
4 files changed, 130 insertions(+), 124 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/src/literals.rs b/src/literals.rs
|
|
||||||
index eebeac4c7249..3e1c385bcda9 100644
|
|
||||||
--- a/src/literals.rs
|
|
||||||
+++ b/src/literals.rs
|
|
||||||
@@ -10,10 +10,8 @@ principled.
|
|
||||||
use std::cmp;
|
|
||||||
|
|
||||||
use regex::bytes::RegexBuilder;
|
|
||||||
-use syntax::{
|
|
||||||
- Expr, Literals, Lit,
|
|
||||||
- ByteClass, ByteRange, CharClass, ClassRange, Repeater,
|
|
||||||
-};
|
|
||||||
+use syntax::hir::{self, Hir, HirKind};
|
|
||||||
+use syntax::hir::literal::{Literal, Literals};
|
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct LiteralSets {
|
|
||||||
@@ -23,12 +21,12 @@ pub struct LiteralSets {
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LiteralSets {
|
|
||||||
- pub fn create(expr: &Expr) -> Self {
|
|
||||||
+ pub fn create(expr: &Hir) -> Self {
|
|
||||||
let mut required = Literals::empty();
|
|
||||||
union_required(expr, &mut required);
|
|
||||||
LiteralSets {
|
|
||||||
- prefixes: expr.prefixes(),
|
|
||||||
- suffixes: expr.suffixes(),
|
|
||||||
+ prefixes: Literals::prefixes(expr),
|
|
||||||
+ suffixes: Literals::suffixes(expr),
|
|
||||||
required: required,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -93,60 +91,52 @@ impl LiteralSets {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
-fn union_required(expr: &Expr, lits: &mut Literals) {
|
|
||||||
- use syntax::Expr::*;
|
|
||||||
- match *expr {
|
|
||||||
- Literal { ref chars, casei: false } => {
|
|
||||||
- let s: String = chars.iter().cloned().collect();
|
|
||||||
- lits.cross_add(s.as_bytes());
|
|
||||||
+fn union_required(expr: &Hir, lits: &mut Literals) {
|
|
||||||
+ match *expr.kind() {
|
|
||||||
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
|
|
||||||
+ let mut buf = [0u8; 4];
|
|
||||||
+ lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
|
|
||||||
}
|
|
||||||
- Literal { ref chars, casei: true } => {
|
|
||||||
- for &c in chars {
|
|
||||||
- let cls = CharClass::new(vec![
|
|
||||||
- ClassRange { start: c, end: c },
|
|
||||||
- ]).case_fold();
|
|
||||||
- if !lits.add_char_class(&cls) {
|
|
||||||
- lits.cut();
|
|
||||||
- return;
|
|
||||||
- }
|
|
||||||
+ HirKind::Literal(hir::Literal::Byte(b)) => {
|
|
||||||
+ lits.cross_add(&[b]);
|
|
||||||
+ }
|
|
||||||
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
|
|
||||||
+ if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
|
|
||||||
+ lits.cut();
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
|
|
||||||
+ if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
|
|
||||||
+ lits.cut();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
- LiteralBytes { ref bytes, casei: false } => {
|
|
||||||
- lits.cross_add(bytes);
|
|
||||||
+ HirKind::Group(hir::Group { ref hir, .. }) => {
|
|
||||||
+ union_required(&**hir, lits);
|
|
||||||
}
|
|
||||||
- LiteralBytes { ref bytes, casei: true } => {
|
|
||||||
- for &b in bytes {
|
|
||||||
- let cls = ByteClass::new(vec![
|
|
||||||
- ByteRange { start: b, end: b },
|
|
||||||
- ]).case_fold();
|
|
||||||
- if !lits.add_byte_class(&cls) {
|
|
||||||
+ HirKind::Repetition(ref x) => {
|
|
||||||
+ match x.kind {
|
|
||||||
+ hir::RepetitionKind::ZeroOrOne => lits.cut(),
|
|
||||||
+ hir::RepetitionKind::ZeroOrMore => lits.cut(),
|
|
||||||
+ hir::RepetitionKind::OneOrMore => {
|
|
||||||
+ union_required(&x.hir, lits);
|
|
||||||
lits.cut();
|
|
||||||
- return;
|
|
||||||
+ }
|
|
||||||
+ hir::RepetitionKind::Range(ref rng) => {
|
|
||||||
+ let (min, max) = match *rng {
|
|
||||||
+ hir::RepetitionRange::Exactly(m) => (m, Some(m)),
|
|
||||||
+ hir::RepetitionRange::AtLeast(m) => (m, None),
|
|
||||||
+ hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
|
|
||||||
+ };
|
|
||||||
+ repeat_range_literals(
|
|
||||||
+ &x.hir, min, max, x.greedy, lits, union_required);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
- Class(_) => {
|
|
||||||
- lits.cut();
|
|
||||||
- }
|
|
||||||
- ClassBytes(_) => {
|
|
||||||
- lits.cut();
|
|
||||||
+ HirKind::Concat(ref es) if es.is_empty() => {}
|
|
||||||
+ HirKind::Concat(ref es) if es.len() == 1 => {
|
|
||||||
+ union_required(&es[0], lits)
|
|
||||||
}
|
|
||||||
- Group { ref e, .. } => {
|
|
||||||
- union_required(&**e, lits);
|
|
||||||
- }
|
|
||||||
- Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(),
|
|
||||||
- Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
|
|
||||||
- Repeat { ref e, r: Repeater::OneOrMore, .. } => {
|
|
||||||
- union_required(&**e, lits);
|
|
||||||
- lits.cut();
|
|
||||||
- }
|
|
||||||
- Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
|
|
||||||
- repeat_range_literals(
|
|
||||||
- &**e, min, max, greedy, lits, union_required);
|
|
||||||
- }
|
|
||||||
- Concat(ref es) if es.is_empty() => {}
|
|
||||||
- Concat(ref es) if es.len() == 1 => union_required(&es[0], lits),
|
|
||||||
- Concat(ref es) => {
|
|
||||||
+ HirKind::Concat(ref es) => {
|
|
||||||
for e in es {
|
|
||||||
let mut lits2 = lits.to_empty();
|
|
||||||
union_required(e, &mut lits2);
|
|
||||||
@@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
|
|
||||||
if lits2.contains_empty() {
|
|
||||||
lits.cut();
|
|
||||||
}
|
|
||||||
- // if !lits.union(lits2) {
|
|
||||||
if !lits.cross_product(&lits2) {
|
|
||||||
// If this expression couldn't yield any literal that
|
|
||||||
// could be extended, then we need to quit. Since we're
|
|
||||||
@@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
- Alternate(ref es) => {
|
|
||||||
+ HirKind::Alternation(ref es) => {
|
|
||||||
alternate_literals(es, lits, union_required);
|
|
||||||
}
|
|
||||||
_ => lits.cut(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
-fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
|
|
||||||
- e: &Expr,
|
|
||||||
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
|
|
||||||
+ e: &Hir,
|
|
||||||
min: u32,
|
|
||||||
max: Option<u32>,
|
|
||||||
_greedy: bool,
|
|
||||||
@@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
-fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
|
|
||||||
- es: &[Expr],
|
|
||||||
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
|
|
||||||
+ es: &[Hir],
|
|
||||||
lits: &mut Literals,
|
|
||||||
mut f: F,
|
|
||||||
) {
|
|
||||||
@@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
|
|
||||||
}
|
|
||||||
lits.cut();
|
|
||||||
if !lcs.is_empty() {
|
|
||||||
- lits.add(Lit::empty());
|
|
||||||
- lits.add(Lit::new(lcs.to_vec()));
|
|
||||||
+ lits.add(Literal::empty());
|
|
||||||
+ lits.add(Literal::new(lcs.to_vec()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+/// Return the number of characters in the given class.
|
|
||||||
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
|
|
||||||
+ cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/// Return the number of bytes in the given class.
|
|
||||||
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
|
|
||||||
+ cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
/// Converts an arbitrary sequence of bytes to a literal suitable for building
|
|
||||||
/// a regular expression.
|
|
||||||
fn bytes_to_regex(bs: &[u8]) -> String {
|
|
||||||
diff --git a/src/nonl.rs b/src/nonl.rs
|
|
||||||
index 361b0b003eb7..3beb5f61ce57 100644
|
|
||||||
--- a/src/nonl.rs
|
|
||||||
+++ b/src/nonl.rs
|
|
||||||
@@ -1,4 +1,4 @@
|
|
||||||
-use syntax::Expr;
|
|
||||||
+use syntax::hir::{self, Hir, HirKind};
|
|
||||||
|
|
||||||
use {Error, Result};
|
|
||||||
|
|
||||||
@@ -9,59 +9,66 @@ use {Error, Result};
|
|
||||||
///
|
|
||||||
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
|
|
||||||
/// function panics.
|
|
||||||
-pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
|
|
||||||
- // TODO(burntsushi): There is a bug in this routine where only `\n` is
|
|
||||||
- // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
|
|
||||||
- // to proper character classes instead of the special `AnyCharNoNL` and
|
|
||||||
- // `AnyByteNoNL` classes.
|
|
||||||
- use syntax::Expr::*;
|
|
||||||
+pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
|
|
||||||
assert!(byte <= 0x7F);
|
|
||||||
let chr = byte as char;
|
|
||||||
assert!(chr.len_utf8() == 1);
|
|
||||||
|
|
||||||
- Ok(match expr {
|
|
||||||
- Literal { chars, casei } => {
|
|
||||||
- if chars.iter().position(|&c| c == chr).is_some() {
|
|
||||||
+ Ok(match expr.into_kind() {
|
|
||||||
+ HirKind::Empty => Hir::empty(),
|
|
||||||
+ HirKind::Literal(hir::Literal::Unicode(c)) => {
|
|
||||||
+ if c == chr {
|
|
||||||
return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
- Literal { chars: chars, casei: casei }
|
|
||||||
+ Hir::literal(hir::Literal::Unicode(c))
|
|
||||||
}
|
|
||||||
- LiteralBytes { bytes, casei } => {
|
|
||||||
- if bytes.iter().position(|&b| b == byte).is_some() {
|
|
||||||
+ HirKind::Literal(hir::Literal::Byte(b)) => {
|
|
||||||
+ if b as char == chr {
|
|
||||||
return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
- LiteralBytes { bytes: bytes, casei: casei }
|
|
||||||
+ Hir::literal(hir::Literal::Byte(b))
|
|
||||||
}
|
|
||||||
- AnyChar => AnyCharNoNL,
|
|
||||||
- AnyByte => AnyByteNoNL,
|
|
||||||
- Class(mut cls) => {
|
|
||||||
- cls.remove(chr);
|
|
||||||
- Class(cls)
|
|
||||||
- }
|
|
||||||
- ClassBytes(mut cls) => {
|
|
||||||
- cls.remove(byte);
|
|
||||||
- ClassBytes(cls)
|
|
||||||
- }
|
|
||||||
- Group { e, i, name } => {
|
|
||||||
- Group {
|
|
||||||
- e: Box::new(remove(*e, byte)?),
|
|
||||||
- i: i,
|
|
||||||
- name: name,
|
|
||||||
+ HirKind::Class(hir::Class::Unicode(mut cls)) => {
|
|
||||||
+ let remove = hir::ClassUnicode::new(Some(
|
|
||||||
+ hir::ClassUnicodeRange::new(chr, chr),
|
|
||||||
+ ));
|
|
||||||
+ cls.difference(&remove);
|
|
||||||
+ if cls.iter().next().is_none() {
|
|
||||||
+ return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
+ Hir::class(hir::Class::Unicode(cls))
|
|
||||||
}
|
|
||||||
- Repeat { e, r, greedy } => {
|
|
||||||
- Repeat {
|
|
||||||
- e: Box::new(remove(*e, byte)?),
|
|
||||||
- r: r,
|
|
||||||
- greedy: greedy,
|
|
||||||
+ HirKind::Class(hir::Class::Bytes(mut cls)) => {
|
|
||||||
+ let remove = hir::ClassBytes::new(Some(
|
|
||||||
+ hir::ClassBytesRange::new(byte, byte),
|
|
||||||
+ ));
|
|
||||||
+ cls.difference(&remove);
|
|
||||||
+ if cls.iter().next().is_none() {
|
|
||||||
+ return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
+ Hir::class(hir::Class::Bytes(cls))
|
|
||||||
+ }
|
|
||||||
+ HirKind::Anchor(x) => Hir::anchor(x),
|
|
||||||
+ HirKind::WordBoundary(x) => Hir::word_boundary(x),
|
|
||||||
+ HirKind::Repetition(mut x) => {
|
|
||||||
+ x.hir = Box::new(remove(*x.hir, byte)?);
|
|
||||||
+ Hir::repetition(x)
|
|
||||||
+ }
|
|
||||||
+ HirKind::Group(mut x) => {
|
|
||||||
+ x.hir = Box::new(remove(*x.hir, byte)?);
|
|
||||||
+ Hir::group(x)
|
|
||||||
}
|
|
||||||
- Concat(exprs) => {
|
|
||||||
- Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
|
|
||||||
+ HirKind::Concat(xs) => {
|
|
||||||
+ let xs = xs.into_iter()
|
|
||||||
+ .map(|e| remove(e, byte))
|
|
||||||
+ .collect::<Result<Vec<Hir>>>()?;
|
|
||||||
+ Hir::concat(xs)
|
|
||||||
}
|
|
||||||
- Alternate(exprs) => {
|
|
||||||
- Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
|
|
||||||
+ HirKind::Alternation(xs) => {
|
|
||||||
+ let xs = xs.into_iter()
|
|
||||||
+ .map(|e| remove(e, byte))
|
|
||||||
+ .collect::<Result<Vec<Hir>>>()?;
|
|
||||||
+ Hir::alternation(xs)
|
|
||||||
}
|
|
||||||
- e => e,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
diff --git a/src/search.rs b/src/search.rs
|
|
||||||
index 8d056796ac14..1d5d7e29cccf 100644
|
|
||||||
--- a/src/search.rs
|
|
||||||
+++ b/src/search.rs
|
|
||||||
@@ -1,10 +1,10 @@
|
|
||||||
use memchr::{memchr, memrchr};
|
|
||||||
use regex::bytes::{Regex, RegexBuilder};
|
|
||||||
-use syntax;
|
|
||||||
|
|
||||||
use literals::LiteralSets;
|
|
||||||
use nonl;
|
|
||||||
-use syntax::Expr;
|
|
||||||
+use syntax::ParserBuilder;
|
|
||||||
+use syntax::hir::Hir;
|
|
||||||
use word_boundary::strip_unicode_word_boundaries;
|
|
||||||
use Result;
|
|
||||||
|
|
||||||
@@ -166,7 +166,7 @@ impl GrepBuilder {
|
|
||||||
|
|
||||||
/// Creates a new regex from the given expression with the current
|
|
||||||
/// configuration.
|
|
||||||
- fn regex(&self, expr: &Expr) -> Result<Regex> {
|
|
||||||
+ fn regex(&self, expr: &Hir) -> Result<Regex> {
|
|
||||||
let mut builder = RegexBuilder::new(&expr.to_string());
|
|
||||||
builder.unicode(true);
|
|
||||||
self.regex_build(builder)
|
|
||||||
@@ -184,15 +184,16 @@ impl GrepBuilder {
|
|
||||||
|
|
||||||
/// Parses the underlying pattern and ensures the pattern can never match
|
|
||||||
/// the line terminator.
|
|
||||||
- fn parse(&self) -> Result<syntax::Expr> {
|
|
||||||
- let expr =
|
|
||||||
- syntax::ExprBuilder::new()
|
|
||||||
- .allow_bytes(true)
|
|
||||||
- .unicode(true)
|
|
||||||
+ fn parse(&self) -> Result<Hir> {
|
|
||||||
+ let expr = ParserBuilder::new()
|
|
||||||
+ .allow_invalid_utf8(true)
|
|
||||||
.case_insensitive(self.is_case_insensitive()?)
|
|
||||||
+ .multi_line(true)
|
|
||||||
+ .build()
|
|
||||||
.parse(&self.pattern)?;
|
|
||||||
+ debug!("original regex HIR pattern:\n{}", expr);
|
|
||||||
let expr = nonl::remove(expr, self.opts.line_terminator)?;
|
|
||||||
- debug!("regex ast:\n{:#?}", expr);
|
|
||||||
+ debug!("transformed regex HIR pattern:\n{}", expr);
|
|
||||||
Ok(expr)
|
|
||||||
}
|
|
||||||
|
|
||||||
diff --git a/src/word_boundary.rs b/src/word_boundary.rs
|
|
||||||
index 6df5c6574933..8e6b86d12df8 100644
|
|
||||||
--- a/src/word_boundary.rs
|
|
||||||
+++ b/src/word_boundary.rs
|
|
||||||
@@ -1,4 +1,4 @@
|
|
||||||
-use syntax::Expr;
|
|
||||||
+use syntax::hir::{self, Hir, HirKind};
|
|
||||||
|
|
||||||
/// Strips Unicode word boundaries from the given expression.
|
|
||||||
///
|
|
||||||
@@ -8,7 +8,7 @@ use syntax::Expr;
|
|
||||||
/// false negatives.
|
|
||||||
///
|
|
||||||
/// If no word boundaries could be stripped, then None is returned.
|
|
||||||
-pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
|
||||||
+pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
|
|
||||||
// The real reason we do this is because Unicode word boundaries are the
|
|
||||||
// one thing that Rust's regex DFA engine can't handle. When it sees a
|
|
||||||
// Unicode word boundary among non-ASCII text, it falls back to one of the
|
|
||||||
@@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
|
||||||
// a regex to find candidate matches without a Unicode word boundary. We'll
|
|
||||||
// only then use the full (and slower) regex to confirm a candidate as a
|
|
||||||
// match or not during search.
|
|
||||||
- use syntax::Expr::*;
|
|
||||||
-
|
|
||||||
- match *expr {
|
|
||||||
- Concat(ref es) if !es.is_empty() => {
|
|
||||||
+ //
|
|
||||||
+ // It looks like we only check the outer edges for `\b`? I guess this is
|
|
||||||
+ // an attempt to optimize for the `-w/--word-regexp` flag? ---AG
|
|
||||||
+ match *expr.kind() {
|
|
||||||
+ HirKind::Concat(ref es) if !es.is_empty() => {
|
|
||||||
let first = is_unicode_word_boundary(&es[0]);
|
|
||||||
let last = is_unicode_word_boundary(es.last().unwrap());
|
|
||||||
// Be careful not to strip word boundaries if there are no other
|
|
||||||
// expressions to match.
|
|
||||||
match (first, last) {
|
|
||||||
(true, false) if es.len() > 1 => {
|
|
||||||
- Some(Concat(es[1..].to_vec()))
|
|
||||||
+ Some(Hir::concat(es[1..].to_vec()))
|
|
||||||
}
|
|
||||||
(false, true) if es.len() > 1 => {
|
|
||||||
- Some(Concat(es[..es.len() - 1].to_vec()))
|
|
||||||
+ Some(Hir::concat(es[..es.len() - 1].to_vec()))
|
|
||||||
}
|
|
||||||
(true, true) if es.len() > 2 => {
|
|
||||||
- Some(Concat(es[1..es.len() - 1].to_vec()))
|
|
||||||
+ Some(Hir::concat(es[1..es.len() - 1].to_vec()))
|
|
||||||
}
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
@@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns true if the given expression is a Unicode word boundary.
|
|
||||||
-fn is_unicode_word_boundary(expr: &Expr) -> bool {
|
|
||||||
- use syntax::Expr::*;
|
|
||||||
-
|
|
||||||
- match *expr {
|
|
||||||
- WordBoundary => true,
|
|
||||||
- NotWordBoundary => true,
|
|
||||||
- Group { ref e, .. } => is_unicode_word_boundary(e),
|
|
||||||
+fn is_unicode_word_boundary(expr: &Hir) -> bool {
|
|
||||||
+ match *expr.kind() {
|
|
||||||
+ HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
|
|
||||||
+ HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
|
|
||||||
+ HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
|
|
||||||
_ => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
--
|
|
||||||
2.14.3
|
|
||||||
|
|
@ -1,328 +0,0 @@
|
|||||||
From 42b8132d0ad1918c1c0dc677015d87c12819fa26 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Andrew Gallant <jamslam@gmail.com>
|
|
||||||
Date: Tue, 13 Mar 2018 21:43:23 -0400
|
|
||||||
Subject: [PATCH 2/2] grep: add "perfect" smart case detection
|
|
||||||
|
|
||||||
This commit removes the previous smart case detection logic and replaces
|
|
||||||
it with detection based on the regex AST. This particular AST is a faithful
|
|
||||||
representation of the concrete syntax, which lets us be very precise in
|
|
||||||
how we handle it.
|
|
||||||
|
|
||||||
Closes #851
|
|
||||||
---
|
|
||||||
src/lib.rs | 1 +
|
|
||||||
src/search.rs | 58 +++------------
|
|
||||||
src/smart_case.rs | 191 +++++++++++++++++++++++++++++++++++++++++++++++++
|
|
||||||
3 files changed, 201 insertions(+), 49 deletions(-)
|
|
||||||
create mode 100644 src/smart_case.rs
|
|
||||||
|
|
||||||
diff --git a/src/lib.rs b/src/lib.rs
|
|
||||||
index 3b2f0ebd65d5..023cd64ac36a 100644
|
|
||||||
--- a/src/lib.rs
|
|
||||||
+++ b/src/lib.rs
|
|
||||||
@@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
|
|
||||||
mod literals;
|
|
||||||
mod nonl;
|
|
||||||
mod search;
|
|
||||||
+mod smart_case;
|
|
||||||
mod word_boundary;
|
|
||||||
|
|
||||||
/// Result is a convenient type alias that fixes the type of the error to
|
|
||||||
diff --git a/src/search.rs b/src/search.rs
|
|
||||||
index 1d5d7e29cccf..49ddf1f875c4 100644
|
|
||||||
--- a/src/search.rs
|
|
||||||
+++ b/src/search.rs
|
|
||||||
@@ -1,10 +1,11 @@
|
|
||||||
use memchr::{memchr, memrchr};
|
|
||||||
+use syntax::ParserBuilder;
|
|
||||||
+use syntax::hir::Hir;
|
|
||||||
use regex::bytes::{Regex, RegexBuilder};
|
|
||||||
|
|
||||||
use literals::LiteralSets;
|
|
||||||
use nonl;
|
|
||||||
-use syntax::ParserBuilder;
|
|
||||||
-use syntax::hir::Hir;
|
|
||||||
+use smart_case::Cased;
|
|
||||||
use word_boundary::strip_unicode_word_boundaries;
|
|
||||||
use Result;
|
|
||||||
|
|
||||||
@@ -205,7 +206,11 @@ impl GrepBuilder {
|
|
||||||
if !self.opts.case_smart {
|
|
||||||
return Ok(false);
|
|
||||||
}
|
|
||||||
- Ok(!has_uppercase_literal(&self.pattern))
|
|
||||||
+ let cased = match Cased::from_pattern(&self.pattern) {
|
|
||||||
+ None => return Ok(false),
|
|
||||||
+ Some(cased) => cased,
|
|
||||||
+ };
|
|
||||||
+ Ok(cased.any_literal && !cased.any_uppercase)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -311,44 +316,15 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
-/// Determine whether the pattern contains an uppercase character which should
|
|
||||||
-/// negate the effect of the smart-case option.
|
|
||||||
-///
|
|
||||||
-/// Ideally we would be able to check the AST in order to correctly handle
|
|
||||||
-/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
|
|
||||||
-/// cased), but we don't currently have that option. For now, our 'good enough'
|
|
||||||
-/// solution is to simply perform a semi-naïve scan of the input pattern and
|
|
||||||
-/// ignore all characters following a '\'. The ExprBuilder will handle any
|
|
||||||
-/// actual errors, and this at least lets us support the most common cases,
|
|
||||||
-/// like 'foo\w' and 'foo\S', in an intuitive manner.
|
|
||||||
-fn has_uppercase_literal(pattern: &str) -> bool {
|
|
||||||
- let mut chars = pattern.chars();
|
|
||||||
- while let Some(c) = chars.next() {
|
|
||||||
- if c == '\\' {
|
|
||||||
- chars.next();
|
|
||||||
- } else if c.is_uppercase() {
|
|
||||||
- return true;
|
|
||||||
- }
|
|
||||||
- }
|
|
||||||
- false
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
- #![allow(unused_imports)]
|
|
||||||
-
|
|
||||||
use memchr::{memchr, memrchr};
|
|
||||||
use regex::bytes::Regex;
|
|
||||||
|
|
||||||
- use super::{GrepBuilder, Match, has_uppercase_literal};
|
|
||||||
+ use super::{GrepBuilder, Match};
|
|
||||||
|
|
||||||
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
|
|
||||||
|
|
||||||
- #[allow(dead_code)]
|
|
||||||
- fn s(bytes: &[u8]) -> String {
|
|
||||||
- String::from_utf8(bytes.to_vec()).unwrap()
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
|
|
||||||
let re = Regex::new(pat).unwrap();
|
|
||||||
let mut lines = vec![];
|
|
||||||
@@ -377,20 +353,4 @@ mod tests {
|
|
||||||
assert_eq!(expected.len(), got.len());
|
|
||||||
assert_eq!(expected, got);
|
|
||||||
}
|
|
||||||
-
|
|
||||||
- #[test]
|
|
||||||
- fn pattern_case() {
|
|
||||||
- assert_eq!(has_uppercase_literal(&"".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
|
|
||||||
- assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
|
|
||||||
- }
|
|
||||||
}
|
|
||||||
diff --git a/src/smart_case.rs b/src/smart_case.rs
|
|
||||||
new file mode 100644
|
|
||||||
index 000000000000..1379b32620bc
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/smart_case.rs
|
|
||||||
@@ -0,0 +1,191 @@
|
|
||||||
+use syntax::ast::{self, Ast};
|
|
||||||
+use syntax::ast::parse::Parser;
|
|
||||||
+
|
|
||||||
+/// The results of analyzing a regex for cased literals.
|
|
||||||
+#[derive(Clone, Debug, Default)]
|
|
||||||
+pub struct Cased {
|
|
||||||
+ /// True if and only if a literal uppercase character occurs in the regex.
|
|
||||||
+ ///
|
|
||||||
+ /// A regex like `\pL` contains no uppercase literals, even though `L`
|
|
||||||
+ /// is uppercase and the `\pL` class contains uppercase characters.
|
|
||||||
+ pub any_uppercase: bool,
|
|
||||||
+ /// True if and only if the regex contains any literal at all. A regex like
|
|
||||||
+ /// `\pL` has this set to false.
|
|
||||||
+ pub any_literal: bool,
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+impl Cased {
|
|
||||||
+ /// Returns a `Cased` value by doing analysis on the AST of `pattern`.
|
|
||||||
+ ///
|
|
||||||
+ /// If `pattern` is not a valid regular expression, then `None` is
|
|
||||||
+ /// returned.
|
|
||||||
+ pub fn from_pattern(pattern: &str) -> Option<Cased> {
|
|
||||||
+ Parser::new()
|
|
||||||
+ .parse(pattern)
|
|
||||||
+ .map(|ast| Cased::from_ast(&ast))
|
|
||||||
+ .ok()
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ fn from_ast(ast: &Ast) -> Cased {
|
|
||||||
+ let mut cased = Cased::default();
|
|
||||||
+ cased.from_ast_impl(ast);
|
|
||||||
+ cased
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ fn from_ast_impl(&mut self, ast: &Ast) {
|
|
||||||
+ if self.done() {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+ match *ast {
|
|
||||||
+ Ast::Empty(_)
|
|
||||||
+ | Ast::Flags(_)
|
|
||||||
+ | Ast::Dot(_)
|
|
||||||
+ | Ast::Assertion(_)
|
|
||||||
+ | Ast::Class(ast::Class::Unicode(_))
|
|
||||||
+ | Ast::Class(ast::Class::Perl(_)) => {}
|
|
||||||
+ Ast::Literal(ref x) => {
|
|
||||||
+ self.from_ast_literal(x);
|
|
||||||
+ }
|
|
||||||
+ Ast::Class(ast::Class::Bracketed(ref x)) => {
|
|
||||||
+ self.from_ast_class_set(&x.kind);
|
|
||||||
+ }
|
|
||||||
+ Ast::Repetition(ref x) => {
|
|
||||||
+ self.from_ast_impl(&x.ast);
|
|
||||||
+ }
|
|
||||||
+ Ast::Group(ref x) => {
|
|
||||||
+ self.from_ast_impl(&x.ast);
|
|
||||||
+ }
|
|
||||||
+ Ast::Alternation(ref alt) => {
|
|
||||||
+ for x in &alt.asts {
|
|
||||||
+ self.from_ast_impl(x);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ Ast::Concat(ref alt) => {
|
|
||||||
+ for x in &alt.asts {
|
|
||||||
+ self.from_ast_impl(x);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
|
|
||||||
+ if self.done() {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+ match *ast {
|
|
||||||
+ ast::ClassSet::Item(ref item) => {
|
|
||||||
+ self.from_ast_class_set_item(item);
|
|
||||||
+ }
|
|
||||||
+ ast::ClassSet::BinaryOp(ref x) => {
|
|
||||||
+ self.from_ast_class_set(&x.lhs);
|
|
||||||
+ self.from_ast_class_set(&x.rhs);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
|
|
||||||
+ if self.done() {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+ match *ast {
|
|
||||||
+ ast::ClassSetItem::Empty(_)
|
|
||||||
+ | ast::ClassSetItem::Ascii(_)
|
|
||||||
+ | ast::ClassSetItem::Unicode(_)
|
|
||||||
+ | ast::ClassSetItem::Perl(_) => {}
|
|
||||||
+ ast::ClassSetItem::Literal(ref x) => {
|
|
||||||
+ self.from_ast_literal(x);
|
|
||||||
+ }
|
|
||||||
+ ast::ClassSetItem::Range(ref x) => {
|
|
||||||
+ self.from_ast_literal(&x.start);
|
|
||||||
+ self.from_ast_literal(&x.end);
|
|
||||||
+ }
|
|
||||||
+ ast::ClassSetItem::Bracketed(ref x) => {
|
|
||||||
+ self.from_ast_class_set(&x.kind);
|
|
||||||
+ }
|
|
||||||
+ ast::ClassSetItem::Union(ref union) => {
|
|
||||||
+ for x in &union.items {
|
|
||||||
+ self.from_ast_class_set_item(x);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ fn from_ast_literal(&mut self, ast: &ast::Literal) {
|
|
||||||
+ self.any_literal = true;
|
|
||||||
+ self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ /// Returns true if and only if the attributes can never change no matter
|
|
||||||
+ /// what other AST it might see.
|
|
||||||
+ fn done(&self) -> bool {
|
|
||||||
+ self.any_uppercase && self.any_literal
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#[cfg(test)]
|
|
||||||
+mod tests {
|
|
||||||
+ use super::*;
|
|
||||||
+
|
|
||||||
+ fn cased(pattern: &str) -> Cased {
|
|
||||||
+ Cased::from_pattern(pattern).unwrap()
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ #[test]
|
|
||||||
+ fn various() {
|
|
||||||
+ let x = cased("");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(!x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased("foo");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased("Foo");
|
|
||||||
+ assert!(x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased("foO");
|
|
||||||
+ assert!(x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo\\");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo\w");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo\S");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo\p{Ll}");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo[a-z]");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo[A-Z]");
|
|
||||||
+ assert!(x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo[\S\t]");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"foo\\S");
|
|
||||||
+ assert!(x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"\p{Ll}");
|
|
||||||
+ assert!(!x.any_uppercase);
|
|
||||||
+ assert!(!x.any_literal);
|
|
||||||
+
|
|
||||||
+ let x = cased(r"aBc\w");
|
|
||||||
+ assert!(x.any_uppercase);
|
|
||||||
+ assert!(x.any_literal);
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
--
|
|
||||||
2.14.3
|
|
||||||
|
|
@ -1,12 +0,0 @@
|
|||||||
--- grep-0.1.8/Cargo.toml 1970-01-01T01:00:00+01:00
|
|
||||||
+++ grep-0.1.8/Cargo.toml 2018-06-12T09:25:27.056693+02:00
|
|
||||||
@@ -28,7 +28,7 @@
|
|
||||||
version = "2"
|
|
||||||
|
|
||||||
[dependencies.regex]
|
|
||||||
-version = "0.2.1"
|
|
||||||
+version = "1"
|
|
||||||
|
|
||||||
[dependencies.regex-syntax]
|
|
||||||
-version = "0.4.0"
|
|
||||||
+version = "0.6"
|
|
@ -1 +1 @@
|
|||||||
SHA512 (grep-0.1.8.crate) = d0a4e9dd51402ed8036aa5d2cfcdb184538d2bf0c81baac75c73e80d05945a63cb08a931d87fa526695af93271cecaed6650a045904ceaa05ed6e06fd23e537b
|
SHA512 (grep-0.1.9.crate) = 1ce3c1c1faf2d001bdafc6e021ec3523e08cfaec611bbaf70492cf0038578c2e738ead0df826229fbc0f5e54ff3f9f9cf928be9223bb53907f12b6f468545ef3
|
||||||
|
Loading…
Reference in new issue