parent
bf84d0ad86
commit
00bd1f2f1d
@ -1 +1,2 @@
|
||||
/grep-regex-0.1.1.crate
|
||||
/grep-regex-0.1.2.crate
|
||||
|
@ -1,387 +0,0 @@
|
||||
From 9d703110cfe01782d2d0b03a340f5983da215e68 Mon Sep 17 00:00:00 2001
|
||||
From: Andrew Gallant <jamslam@gmail.com>
|
||||
Date: Sat, 26 Jan 2019 12:25:21 -0500
|
||||
Subject: [PATCH] regex: make CRLF hack more robust
|
||||
|
||||
This commit improves the CRLF hack to be more robust. In particular, in
|
||||
addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end
|
||||
of a match if and only if the regex has an ending line anchor required for
|
||||
a match. This doesn't quite make the hack 100% correct, but should fix most
|
||||
use cases in practice. An example of a regex that will still be incorrect
|
||||
is `foo|bar$`, since the analysis isn't quite sophisticated enough to
|
||||
determine that a `\r` can be safely stripped from any match. Even if we
|
||||
fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas,
|
||||
more work on this front should really be focused on enabling this in the
|
||||
regex engine itself.
|
||||
|
||||
The specific cause of this bug was that grep-searcher was sneakily
|
||||
stripping CRLF from matching lines when it really shouldn't have. We remove
|
||||
that code now, and instead rely on better match semantics provided at a
|
||||
lower level.
|
||||
|
||||
Fixes #1095
|
||||
---
|
||||
src/config.rs | 8 ++++
|
||||
src/crlf.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
src/matcher.rs | 57 ++++++++++++++++++++++++++--
|
||||
3 files changed, 162 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/config.rs b/src/config.rs
|
||||
index f3d1f1c..efed9d4 100644
|
||||
--- a/src/config.rs
|
||||
+++ b/src/config.rs
|
||||
@@ -160,6 +160,14 @@ impl ConfiguredHIR {
|
||||
non_matching_bytes(&self.expr)
|
||||
}
|
||||
|
||||
+ /// Returns true if and only if this regex needs to have its match offsets
|
||||
+ /// tweaked because of CRLF support. Specifically, this occurs when the
|
||||
+ /// CRLF hack is enabled and the regex is line anchored at the end. In
|
||||
+ /// this case, matches that end with a `\r` have the `\r` stripped.
|
||||
+ pub fn needs_crlf_stripped(&self) -> bool {
|
||||
+ self.config.crlf && self.expr.is_line_anchored_end()
|
||||
+ }
|
||||
+
|
||||
/// Builds a regular expression from this HIR expression.
|
||||
pub fn regex(&self) -> Result<Regex, Error> {
|
||||
self.pattern_to_regex(&self.expr.to_string())
|
||||
diff --git a/src/crlf.rs b/src/crlf.rs
|
||||
index ff6b15b..838a28f 100644
|
||||
--- a/src/crlf.rs
|
||||
+++ b/src/crlf.rs
|
||||
@@ -1,5 +1,105 @@
|
||||
+use std::collections::HashMap;
|
||||
+
|
||||
+use grep_matcher::{Match, Matcher, NoError};
|
||||
+use regex::bytes::Regex;
|
||||
use regex_syntax::hir::{self, Hir, HirKind};
|
||||
|
||||
+use config::ConfiguredHIR;
|
||||
+use error::Error;
|
||||
+use matcher::RegexCaptures;
|
||||
+
|
||||
+/// A matcher for implementing "word match" semantics.
|
||||
+#[derive(Clone, Debug)]
|
||||
+pub struct CRLFMatcher {
|
||||
+ /// The regex.
|
||||
+ regex: Regex,
|
||||
+ /// A map from capture group name to capture group index.
|
||||
+ names: HashMap<String, usize>,
|
||||
+}
|
||||
+
|
||||
+impl CRLFMatcher {
|
||||
+ /// Create a new matcher from the given pattern that strips `\r` from the
|
||||
+ /// end of every match.
|
||||
+ ///
|
||||
+ /// This panics if the given expression doesn't need its CRLF stripped.
|
||||
+ pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
|
||||
+ assert!(expr.needs_crlf_stripped());
|
||||
+
|
||||
+ let regex = expr.regex()?;
|
||||
+ let mut names = HashMap::new();
|
||||
+ for (i, optional_name) in regex.capture_names().enumerate() {
|
||||
+ if let Some(name) = optional_name {
|
||||
+ names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||
+ }
|
||||
+ }
|
||||
+ Ok(CRLFMatcher { regex, names })
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+impl Matcher for CRLFMatcher {
|
||||
+ type Captures = RegexCaptures;
|
||||
+ type Error = NoError;
|
||||
+
|
||||
+ fn find_at(
|
||||
+ &self,
|
||||
+ haystack: &[u8],
|
||||
+ at: usize,
|
||||
+ ) -> Result<Option<Match>, NoError> {
|
||||
+ let m = match self.regex.find_at(haystack, at) {
|
||||
+ None => return Ok(None),
|
||||
+ Some(m) => Match::new(m.start(), m.end()),
|
||||
+ };
|
||||
+ Ok(Some(adjust_match(haystack, m)))
|
||||
+ }
|
||||
+
|
||||
+ fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||
+ Ok(RegexCaptures::new(self.regex.capture_locations()))
|
||||
+ }
|
||||
+
|
||||
+ fn capture_count(&self) -> usize {
|
||||
+ self.regex.captures_len().checked_sub(1).unwrap()
|
||||
+ }
|
||||
+
|
||||
+ fn capture_index(&self, name: &str) -> Option<usize> {
|
||||
+ self.names.get(name).map(|i| *i)
|
||||
+ }
|
||||
+
|
||||
+ fn captures_at(
|
||||
+ &self,
|
||||
+ haystack: &[u8],
|
||||
+ at: usize,
|
||||
+ caps: &mut RegexCaptures,
|
||||
+ ) -> Result<bool, NoError> {
|
||||
+ caps.strip_crlf(false);
|
||||
+ let r = self.regex.captures_read_at(caps.locations(), haystack, at);
|
||||
+ if !r.is_some() {
|
||||
+ return Ok(false);
|
||||
+ }
|
||||
+
|
||||
+ // If the end of our match includes a `\r`, then strip it from all
|
||||
+ // capture groups ending at the same location.
|
||||
+ let end = caps.locations().get(0).unwrap().1;
|
||||
+ if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
||||
+ caps.strip_crlf(true);
|
||||
+ }
|
||||
+ Ok(true)
|
||||
+ }
|
||||
+
|
||||
+ // We specifically do not implement other methods like find_iter or
|
||||
+ // captures_iter. Namely, the iter methods are guaranteed to be correct
|
||||
+ // by virtue of implementing find_at and captures_at above.
|
||||
+}
|
||||
+
|
||||
+/// If the given match ends with a `\r`, then return a new match that ends
|
||||
+/// immediately before the `\r`.
|
||||
+pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
|
||||
+ if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
|
||||
+ m.with_end(m.end() - 1)
|
||||
+ } else {
|
||||
+ m
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
|
||||
///
|
||||
/// This does not preserve the exact semantics of the given expression,
|
||||
diff --git a/src/matcher.rs b/src/matcher.rs
|
||||
index 831d573..b15d824 100644
|
||||
--- a/src/matcher.rs
|
||||
+++ b/src/matcher.rs
|
||||
@@ -6,6 +6,7 @@ use grep_matcher::{
|
||||
use regex::bytes::{CaptureLocations, Regex};
|
||||
|
||||
use config::{Config, ConfiguredHIR};
|
||||
+use crlf::CRLFMatcher;
|
||||
use error::Error;
|
||||
use word::WordMatcher;
|
||||
|
||||
@@ -344,6 +345,11 @@ impl RegexMatcher {
|
||||
enum RegexMatcherImpl {
|
||||
/// The standard matcher used for all regular expressions.
|
||||
Standard(StandardMatcher),
|
||||
+ /// A matcher that strips `\r` from the end of matches.
|
||||
+ ///
|
||||
+ /// This is only used when the CRLF hack is enabled and the regex is line
|
||||
+ /// anchored at the end.
|
||||
+ CRLF(CRLFMatcher),
|
||||
/// A matcher that only matches at word boundaries. This transforms the
|
||||
/// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
|
||||
/// Because of this, the WordMatcher provides its own implementation of
|
||||
@@ -358,6 +364,8 @@ impl RegexMatcherImpl {
|
||||
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
||||
if expr.config().word {
|
||||
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
||||
+ } else if expr.needs_crlf_stripped() {
|
||||
+ Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
|
||||
} else {
|
||||
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
|
||||
}
|
||||
@@ -379,6 +387,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find_at(haystack, at),
|
||||
+ CRLF(ref m) => m.find_at(haystack, at),
|
||||
Word(ref m) => m.find_at(haystack, at),
|
||||
}
|
||||
}
|
||||
@@ -387,6 +396,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.new_captures(),
|
||||
+ CRLF(ref m) => m.new_captures(),
|
||||
Word(ref m) => m.new_captures(),
|
||||
}
|
||||
}
|
||||
@@ -395,6 +405,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.capture_count(),
|
||||
+ CRLF(ref m) => m.capture_count(),
|
||||
Word(ref m) => m.capture_count(),
|
||||
}
|
||||
}
|
||||
@@ -403,6 +414,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.capture_index(name),
|
||||
+ CRLF(ref m) => m.capture_index(name),
|
||||
Word(ref m) => m.capture_index(name),
|
||||
}
|
||||
}
|
||||
@@ -411,6 +423,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find(haystack),
|
||||
+ CRLF(ref m) => m.find(haystack),
|
||||
Word(ref m) => m.find(haystack),
|
||||
}
|
||||
}
|
||||
@@ -425,6 +438,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find_iter(haystack, matched),
|
||||
+ CRLF(ref m) => m.find_iter(haystack, matched),
|
||||
Word(ref m) => m.find_iter(haystack, matched),
|
||||
}
|
||||
}
|
||||
@@ -439,6 +453,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.try_find_iter(haystack, matched),
|
||||
+ CRLF(ref m) => m.try_find_iter(haystack, matched),
|
||||
Word(ref m) => m.try_find_iter(haystack, matched),
|
||||
}
|
||||
}
|
||||
@@ -451,6 +466,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures(haystack, caps),
|
||||
+ CRLF(ref m) => m.captures(haystack, caps),
|
||||
Word(ref m) => m.captures(haystack, caps),
|
||||
}
|
||||
}
|
||||
@@ -466,6 +482,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
+ CRLF(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
}
|
||||
}
|
||||
@@ -481,6 +498,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
+ CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
}
|
||||
}
|
||||
@@ -494,6 +512,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures_at(haystack, at, caps),
|
||||
+ CRLF(ref m) => m.captures_at(haystack, at, caps),
|
||||
Word(ref m) => m.captures_at(haystack, at, caps),
|
||||
}
|
||||
}
|
||||
@@ -509,6 +528,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.replace(haystack, dst, append),
|
||||
+ CRLF(ref m) => m.replace(haystack, dst, append),
|
||||
Word(ref m) => m.replace(haystack, dst, append),
|
||||
}
|
||||
}
|
||||
@@ -527,6 +547,9 @@ impl Matcher for RegexMatcher {
|
||||
Standard(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
+ CRLF(ref m) => {
|
||||
+ m.replace_with_captures(haystack, caps, dst, append)
|
||||
+ }
|
||||
Word(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
@@ -537,6 +560,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.is_match(haystack),
|
||||
+ CRLF(ref m) => m.is_match(haystack),
|
||||
Word(ref m) => m.is_match(haystack),
|
||||
}
|
||||
}
|
||||
@@ -549,6 +573,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.is_match_at(haystack, at),
|
||||
+ CRLF(ref m) => m.is_match_at(haystack, at),
|
||||
Word(ref m) => m.is_match_at(haystack, at),
|
||||
}
|
||||
}
|
||||
@@ -560,6 +585,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.shortest_match(haystack),
|
||||
+ CRLF(ref m) => m.shortest_match(haystack),
|
||||
Word(ref m) => m.shortest_match(haystack),
|
||||
}
|
||||
}
|
||||
@@ -572,6 +598,7 @@ impl Matcher for RegexMatcher {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.shortest_match_at(haystack, at),
|
||||
+ CRLF(ref m) => m.shortest_match_at(haystack, at),
|
||||
Word(ref m) => m.shortest_match_at(haystack, at),
|
||||
}
|
||||
}
|
||||
@@ -712,6 +739,9 @@ pub struct RegexCaptures {
|
||||
/// and the capturing groups must behave as if `(re)` is the `0`th capture
|
||||
/// group.
|
||||
offset: usize,
|
||||
+ /// When enable, the end of a match has `\r` stripped from it, if one
|
||||
+ /// exists.
|
||||
+ strip_crlf: bool,
|
||||
}
|
||||
|
||||
impl Captures for RegexCaptures {
|
||||
@@ -720,8 +750,25 @@ impl Captures for RegexCaptures {
|
||||
}
|
||||
|
||||
fn get(&self, i: usize) -> Option<Match> {
|
||||
- let actual = i.checked_add(self.offset).unwrap();
|
||||
- self.locs.pos(actual).map(|(s, e)| Match::new(s, e))
|
||||
+ if !self.strip_crlf {
|
||||
+ let actual = i.checked_add(self.offset).unwrap();
|
||||
+ return self.locs.pos(actual).map(|(s, e)| Match::new(s, e));
|
||||
+ }
|
||||
+
|
||||
+ // currently don't support capture offsetting with CRLF stripping
|
||||
+ assert_eq!(self.offset, 0);
|
||||
+ let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) {
|
||||
+ None => return None,
|
||||
+ Some(m) => m,
|
||||
+ };
|
||||
+ // If the end position of this match corresponds to the end position
|
||||
+ // of the overall match, then we apply our CRLF stripping. Otherwise,
|
||||
+ // we cannot assume stripping is correct.
|
||||
+ if i == 0 || m.end() == self.locs.pos(0).unwrap().1 {
|
||||
+ Some(m.with_end(m.end() - 1))
|
||||
+ } else {
|
||||
+ Some(m)
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -734,12 +781,16 @@ impl RegexCaptures {
|
||||
locs: CaptureLocations,
|
||||
offset: usize,
|
||||
) -> RegexCaptures {
|
||||
- RegexCaptures { locs, offset }
|
||||
+ RegexCaptures { locs, offset, strip_crlf: false }
|
||||
}
|
||||
|
||||
pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
|
||||
&mut self.locs
|
||||
}
|
||||
+
|
||||
+ pub(crate) fn strip_crlf(&mut self, yes: bool) {
|
||||
+ self.strip_crlf = yes;
|
||||
+ }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
--
|
||||
2.20.1
|
||||
|
@ -1,11 +0,0 @@
|
||||
--- grep-regex-0.1.1/Cargo.toml 1970-01-01T01:00:00+01:00
|
||||
+++ grep-regex-0.1.1/Cargo.toml 2019-02-14T09:25:19.720854+01:00
|
||||
@@ -31,7 +31,7 @@
|
||||
version = "1.0.5"
|
||||
|
||||
[dependencies.regex-syntax]
|
||||
-version = "0.6.2"
|
||||
+version = "0.6.5"
|
||||
|
||||
[dependencies.thread_local]
|
||||
version = "0.3.6"
|
@ -1 +1 @@
|
||||
SHA512 (grep-regex-0.1.1.crate) = 281585dca0d8e42778bda7a23780ddef38d31145c36e3ea831b1edb98be66c87bba553303fb973af1211aef21d1deb11d2f9466947fb0622b2d1bf180af01a1f
|
||||
SHA512 (grep-regex-0.1.2.crate) = 4ccaeda895b554969398dd1875e486b1994c7ef539277e5f4f30549b8d65a18228a48a5375ade1638ad7c26fd79d57ef0273e6d1b14dc285d9690bd63572872d
|
||||
|
Loading…
Reference in new issue