You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
388 lines
14 KiB
388 lines
14 KiB
From 9d703110cfe01782d2d0b03a340f5983da215e68 Mon Sep 17 00:00:00 2001
|
|
From: Andrew Gallant <jamslam@gmail.com>
|
|
Date: Sat, 26 Jan 2019 12:25:21 -0500
|
|
Subject: [PATCH] regex: make CRLF hack more robust
|
|
|
|
This commit improves the CRLF hack to be more robust. In particular, in
|
|
addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end
|
|
of a match if and only if the regex has an ending line anchor required for
|
|
a match. This doesn't quite make the hack 100% correct, but should fix most
|
|
use cases in practice. An example of a regex that will still be incorrect
|
|
is `foo|bar$`, since the analysis isn't quite sophisticated enough to
|
|
determine that a `\r` can be safely stripped from any match. Even if we
|
|
fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas,
|
|
more work on this front should really be focused on enabling this in the
|
|
regex engine itself.
|
|
|
|
The specific cause of this bug was that grep-searcher was sneakily
|
|
stripping CRLF from matching lines when it really shouldn't have. We remove
|
|
that code now, and instead rely on better match semantics provided at a
|
|
lower level.
|
|
|
|
Fixes #1095
|
|
---
|
|
src/config.rs | 8 ++++
|
|
src/crlf.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
|
|
src/matcher.rs | 57 ++++++++++++++++++++++++++--
|
|
3 files changed, 162 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/src/config.rs b/src/config.rs
|
|
index f3d1f1c..efed9d4 100644
|
|
--- a/src/config.rs
|
|
+++ b/src/config.rs
|
|
@@ -160,6 +160,14 @@ impl ConfiguredHIR {
|
|
non_matching_bytes(&self.expr)
|
|
}
|
|
|
|
+ /// Returns true if and only if this regex needs to have its match offsets
|
|
+ /// tweaked because of CRLF support. Specifically, this occurs when the
|
|
+ /// CRLF hack is enabled and the regex is line anchored at the end. In
|
|
+ /// this case, matches that end with a `\r` have the `\r` stripped.
|
|
+ pub fn needs_crlf_stripped(&self) -> bool {
|
|
+ self.config.crlf && self.expr.is_line_anchored_end()
|
|
+ }
|
|
+
|
|
/// Builds a regular expression from this HIR expression.
|
|
pub fn regex(&self) -> Result<Regex, Error> {
|
|
self.pattern_to_regex(&self.expr.to_string())
|
|
diff --git a/src/crlf.rs b/src/crlf.rs
|
|
index ff6b15b..838a28f 100644
|
|
--- a/src/crlf.rs
|
|
+++ b/src/crlf.rs
|
|
@@ -1,5 +1,105 @@
|
|
+use std::collections::HashMap;
|
|
+
|
|
+use grep_matcher::{Match, Matcher, NoError};
|
|
+use regex::bytes::Regex;
|
|
use regex_syntax::hir::{self, Hir, HirKind};
|
|
|
|
+use config::ConfiguredHIR;
|
|
+use error::Error;
|
|
+use matcher::RegexCaptures;
|
|
+
|
|
+/// A matcher for implementing "word match" semantics.
|
|
+#[derive(Clone, Debug)]
|
|
+pub struct CRLFMatcher {
|
|
+ /// The regex.
|
|
+ regex: Regex,
|
|
+ /// A map from capture group name to capture group index.
|
|
+ names: HashMap<String, usize>,
|
|
+}
|
|
+
|
|
+impl CRLFMatcher {
|
|
+ /// Create a new matcher from the given pattern that strips `\r` from the
|
|
+ /// end of every match.
|
|
+ ///
|
|
+ /// This panics if the given expression doesn't need its CRLF stripped.
|
|
+ pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
|
|
+ assert!(expr.needs_crlf_stripped());
|
|
+
|
|
+ let regex = expr.regex()?;
|
|
+ let mut names = HashMap::new();
|
|
+ for (i, optional_name) in regex.capture_names().enumerate() {
|
|
+ if let Some(name) = optional_name {
|
|
+ names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
|
+ }
|
|
+ }
|
|
+ Ok(CRLFMatcher { regex, names })
|
|
+ }
|
|
+}
|
|
+
|
|
+impl Matcher for CRLFMatcher {
|
|
+ type Captures = RegexCaptures;
|
|
+ type Error = NoError;
|
|
+
|
|
+ fn find_at(
|
|
+ &self,
|
|
+ haystack: &[u8],
|
|
+ at: usize,
|
|
+ ) -> Result<Option<Match>, NoError> {
|
|
+ let m = match self.regex.find_at(haystack, at) {
|
|
+ None => return Ok(None),
|
|
+ Some(m) => Match::new(m.start(), m.end()),
|
|
+ };
|
|
+ Ok(Some(adjust_match(haystack, m)))
|
|
+ }
|
|
+
|
|
+ fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
|
+ Ok(RegexCaptures::new(self.regex.capture_locations()))
|
|
+ }
|
|
+
|
|
+ fn capture_count(&self) -> usize {
|
|
+ self.regex.captures_len().checked_sub(1).unwrap()
|
|
+ }
|
|
+
|
|
+ fn capture_index(&self, name: &str) -> Option<usize> {
|
|
+ self.names.get(name).map(|i| *i)
|
|
+ }
|
|
+
|
|
+ fn captures_at(
|
|
+ &self,
|
|
+ haystack: &[u8],
|
|
+ at: usize,
|
|
+ caps: &mut RegexCaptures,
|
|
+ ) -> Result<bool, NoError> {
|
|
+ caps.strip_crlf(false);
|
|
+ let r = self.regex.captures_read_at(caps.locations(), haystack, at);
|
|
+ if !r.is_some() {
|
|
+ return Ok(false);
|
|
+ }
|
|
+
|
|
+ // If the end of our match includes a `\r`, then strip it from all
|
|
+ // capture groups ending at the same location.
|
|
+ let end = caps.locations().get(0).unwrap().1;
|
|
+ if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
|
+ caps.strip_crlf(true);
|
|
+ }
|
|
+ Ok(true)
|
|
+ }
|
|
+
|
|
+ // We specifically do not implement other methods like find_iter or
|
|
+ // captures_iter. Namely, the iter methods are guaranteed to be correct
|
|
+ // by virtue of implementing find_at and captures_at above.
|
|
+}
|
|
+
|
|
+/// If the given match ends with a `\r`, then return a new match that ends
|
|
+/// immediately before the `\r`.
|
|
+pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
|
|
+ if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
|
|
+ m.with_end(m.end() - 1)
|
|
+ } else {
|
|
+ m
|
|
+ }
|
|
+}
|
|
+
|
|
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
|
|
///
|
|
/// This does not preserve the exact semantics of the given expression,
|
|
diff --git a/src/matcher.rs b/src/matcher.rs
|
|
index 831d573..b15d824 100644
|
|
--- a/src/matcher.rs
|
|
+++ b/src/matcher.rs
|
|
@@ -6,6 +6,7 @@ use grep_matcher::{
|
|
use regex::bytes::{CaptureLocations, Regex};
|
|
|
|
use config::{Config, ConfiguredHIR};
|
|
+use crlf::CRLFMatcher;
|
|
use error::Error;
|
|
use word::WordMatcher;
|
|
|
|
@@ -344,6 +345,11 @@ impl RegexMatcher {
|
|
enum RegexMatcherImpl {
|
|
/// The standard matcher used for all regular expressions.
|
|
Standard(StandardMatcher),
|
|
+ /// A matcher that strips `\r` from the end of matches.
|
|
+ ///
|
|
+ /// This is only used when the CRLF hack is enabled and the regex is line
|
|
+ /// anchored at the end.
|
|
+ CRLF(CRLFMatcher),
|
|
/// A matcher that only matches at word boundaries. This transforms the
|
|
/// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
|
|
/// Because of this, the WordMatcher provides its own implementation of
|
|
@@ -358,6 +364,8 @@ impl RegexMatcherImpl {
|
|
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
|
if expr.config().word {
|
|
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
|
+ } else if expr.needs_crlf_stripped() {
|
|
+ Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
|
|
} else {
|
|
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
|
|
}
|
|
@@ -379,6 +387,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.find_at(haystack, at),
|
|
+ CRLF(ref m) => m.find_at(haystack, at),
|
|
Word(ref m) => m.find_at(haystack, at),
|
|
}
|
|
}
|
|
@@ -387,6 +396,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.new_captures(),
|
|
+ CRLF(ref m) => m.new_captures(),
|
|
Word(ref m) => m.new_captures(),
|
|
}
|
|
}
|
|
@@ -395,6 +405,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.capture_count(),
|
|
+ CRLF(ref m) => m.capture_count(),
|
|
Word(ref m) => m.capture_count(),
|
|
}
|
|
}
|
|
@@ -403,6 +414,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.capture_index(name),
|
|
+ CRLF(ref m) => m.capture_index(name),
|
|
Word(ref m) => m.capture_index(name),
|
|
}
|
|
}
|
|
@@ -411,6 +423,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.find(haystack),
|
|
+ CRLF(ref m) => m.find(haystack),
|
|
Word(ref m) => m.find(haystack),
|
|
}
|
|
}
|
|
@@ -425,6 +438,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.find_iter(haystack, matched),
|
|
+ CRLF(ref m) => m.find_iter(haystack, matched),
|
|
Word(ref m) => m.find_iter(haystack, matched),
|
|
}
|
|
}
|
|
@@ -439,6 +453,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.try_find_iter(haystack, matched),
|
|
+ CRLF(ref m) => m.try_find_iter(haystack, matched),
|
|
Word(ref m) => m.try_find_iter(haystack, matched),
|
|
}
|
|
}
|
|
@@ -451,6 +466,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.captures(haystack, caps),
|
|
+ CRLF(ref m) => m.captures(haystack, caps),
|
|
Word(ref m) => m.captures(haystack, caps),
|
|
}
|
|
}
|
|
@@ -466,6 +482,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
|
+ CRLF(ref m) => m.captures_iter(haystack, caps, matched),
|
|
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
|
}
|
|
}
|
|
@@ -481,6 +498,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
|
|
+ CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
|
|
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
|
}
|
|
}
|
|
@@ -494,6 +512,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.captures_at(haystack, at, caps),
|
|
+ CRLF(ref m) => m.captures_at(haystack, at, caps),
|
|
Word(ref m) => m.captures_at(haystack, at, caps),
|
|
}
|
|
}
|
|
@@ -509,6 +528,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.replace(haystack, dst, append),
|
|
+ CRLF(ref m) => m.replace(haystack, dst, append),
|
|
Word(ref m) => m.replace(haystack, dst, append),
|
|
}
|
|
}
|
|
@@ -527,6 +547,9 @@ impl Matcher for RegexMatcher {
|
|
Standard(ref m) => {
|
|
m.replace_with_captures(haystack, caps, dst, append)
|
|
}
|
|
+ CRLF(ref m) => {
|
|
+ m.replace_with_captures(haystack, caps, dst, append)
|
|
+ }
|
|
Word(ref m) => {
|
|
m.replace_with_captures(haystack, caps, dst, append)
|
|
}
|
|
@@ -537,6 +560,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.is_match(haystack),
|
|
+ CRLF(ref m) => m.is_match(haystack),
|
|
Word(ref m) => m.is_match(haystack),
|
|
}
|
|
}
|
|
@@ -549,6 +573,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.is_match_at(haystack, at),
|
|
+ CRLF(ref m) => m.is_match_at(haystack, at),
|
|
Word(ref m) => m.is_match_at(haystack, at),
|
|
}
|
|
}
|
|
@@ -560,6 +585,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.shortest_match(haystack),
|
|
+ CRLF(ref m) => m.shortest_match(haystack),
|
|
Word(ref m) => m.shortest_match(haystack),
|
|
}
|
|
}
|
|
@@ -572,6 +598,7 @@ impl Matcher for RegexMatcher {
|
|
use self::RegexMatcherImpl::*;
|
|
match self.matcher {
|
|
Standard(ref m) => m.shortest_match_at(haystack, at),
|
|
+ CRLF(ref m) => m.shortest_match_at(haystack, at),
|
|
Word(ref m) => m.shortest_match_at(haystack, at),
|
|
}
|
|
}
|
|
@@ -712,6 +739,9 @@ pub struct RegexCaptures {
|
|
/// and the capturing groups must behave as if `(re)` is the `0`th capture
|
|
/// group.
|
|
offset: usize,
|
|
+ /// When enable, the end of a match has `\r` stripped from it, if one
|
|
+ /// exists.
|
|
+ strip_crlf: bool,
|
|
}
|
|
|
|
impl Captures for RegexCaptures {
|
|
@@ -720,8 +750,25 @@ impl Captures for RegexCaptures {
|
|
}
|
|
|
|
fn get(&self, i: usize) -> Option<Match> {
|
|
- let actual = i.checked_add(self.offset).unwrap();
|
|
- self.locs.pos(actual).map(|(s, e)| Match::new(s, e))
|
|
+ if !self.strip_crlf {
|
|
+ let actual = i.checked_add(self.offset).unwrap();
|
|
+ return self.locs.pos(actual).map(|(s, e)| Match::new(s, e));
|
|
+ }
|
|
+
|
|
+ // currently don't support capture offsetting with CRLF stripping
|
|
+ assert_eq!(self.offset, 0);
|
|
+ let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) {
|
|
+ None => return None,
|
|
+ Some(m) => m,
|
|
+ };
|
|
+ // If the end position of this match corresponds to the end position
|
|
+ // of the overall match, then we apply our CRLF stripping. Otherwise,
|
|
+ // we cannot assume stripping is correct.
|
|
+ if i == 0 || m.end() == self.locs.pos(0).unwrap().1 {
|
|
+ Some(m.with_end(m.end() - 1))
|
|
+ } else {
|
|
+ Some(m)
|
|
+ }
|
|
}
|
|
}
|
|
|
|
@@ -734,12 +781,16 @@ impl RegexCaptures {
|
|
locs: CaptureLocations,
|
|
offset: usize,
|
|
) -> RegexCaptures {
|
|
- RegexCaptures { locs, offset }
|
|
+ RegexCaptures { locs, offset, strip_crlf: false }
|
|
}
|
|
|
|
pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
|
|
&mut self.locs
|
|
}
|
|
+
|
|
+ pub(crate) fn strip_crlf(&mut self, yes: bool) {
|
|
+ self.strip_crlf = yes;
|
|
+ }
|
|
}
|
|
|
|
#[cfg(test)]
|
|
--
|
|
2.20.1
|
|
|