diff --git a/0001-regex-make-CRLF-hack-more-robust.patch b/0001-regex-make-CRLF-hack-more-robust.patch new file mode 100644 index 0000000..efaf64e --- /dev/null +++ b/0001-regex-make-CRLF-hack-more-robust.patch @@ -0,0 +1,387 @@ +From 9d703110cfe01782d2d0b03a340f5983da215e68 Mon Sep 17 00:00:00 2001 +From: Andrew Gallant +Date: Sat, 26 Jan 2019 12:25:21 -0500 +Subject: [PATCH] regex: make CRLF hack more robust + +This commit improves the CRLF hack to be more robust. In particular, in +addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end +of a match if and only if the regex has an ending line anchor required for +a match. This doesn't quite make the hack 100% correct, but should fix most +use cases in practice. An example of a regex that will still be incorrect +is `foo|bar$`, since the analysis isn't quite sophisticated enough to +determine that a `\r` can be safely stripped from any match. Even if we +fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas, +more work on this front should really be focused on enabling this in the +regex engine itself. + +The specific cause of this bug was that grep-searcher was sneakily +stripping CRLF from matching lines when it really shouldn't have. We remove +that code now, and instead rely on better match semantics provided at a +lower level. + +Fixes #1095 +--- + src/config.rs | 8 ++++ + src/crlf.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++++++ + src/matcher.rs | 57 ++++++++++++++++++++++++++-- + 3 files changed, 162 insertions(+), 3 deletions(-) + +diff --git a/src/config.rs b/src/config.rs +index f3d1f1c..efed9d4 100644 +--- a/src/config.rs ++++ b/src/config.rs +@@ -160,6 +160,14 @@ impl ConfiguredHIR { + non_matching_bytes(&self.expr) + } + ++ /// Returns true if and only if this regex needs to have its match offsets ++ /// tweaked because of CRLF support. Specifically, this occurs when the ++ /// CRLF hack is enabled and the regex is line anchored at the end. In ++ /// this case, matches that end with a `\r` have the `\r` stripped. ++ pub fn needs_crlf_stripped(&self) -> bool { ++ self.config.crlf && self.expr.is_line_anchored_end() ++ } ++ + /// Builds a regular expression from this HIR expression. + pub fn regex(&self) -> Result { + self.pattern_to_regex(&self.expr.to_string()) +diff --git a/src/crlf.rs b/src/crlf.rs +index ff6b15b..838a28f 100644 +--- a/src/crlf.rs ++++ b/src/crlf.rs +@@ -1,5 +1,105 @@ ++use std::collections::HashMap; ++ ++use grep_matcher::{Match, Matcher, NoError}; ++use regex::bytes::Regex; + use regex_syntax::hir::{self, Hir, HirKind}; + ++use config::ConfiguredHIR; ++use error::Error; ++use matcher::RegexCaptures; ++ ++/// A matcher for implementing "word match" semantics. ++#[derive(Clone, Debug)] ++pub struct CRLFMatcher { ++ /// The regex. ++ regex: Regex, ++ /// A map from capture group name to capture group index. ++ names: HashMap, ++} ++ ++impl CRLFMatcher { ++ /// Create a new matcher from the given pattern that strips `\r` from the ++ /// end of every match. ++ /// ++ /// This panics if the given expression doesn't need its CRLF stripped. ++ pub fn new(expr: &ConfiguredHIR) -> Result { ++ assert!(expr.needs_crlf_stripped()); ++ ++ let regex = expr.regex()?; ++ let mut names = HashMap::new(); ++ for (i, optional_name) in regex.capture_names().enumerate() { ++ if let Some(name) = optional_name { ++ names.insert(name.to_string(), i.checked_sub(1).unwrap()); ++ } ++ } ++ Ok(CRLFMatcher { regex, names }) ++ } ++} ++ ++impl Matcher for CRLFMatcher { ++ type Captures = RegexCaptures; ++ type Error = NoError; ++ ++ fn find_at( ++ &self, ++ haystack: &[u8], ++ at: usize, ++ ) -> Result, NoError> { ++ let m = match self.regex.find_at(haystack, at) { ++ None => return Ok(None), ++ Some(m) => Match::new(m.start(), m.end()), ++ }; ++ Ok(Some(adjust_match(haystack, m))) ++ } ++ ++ fn new_captures(&self) -> Result { ++ Ok(RegexCaptures::new(self.regex.capture_locations())) ++ } ++ ++ fn capture_count(&self) -> usize { ++ self.regex.captures_len().checked_sub(1).unwrap() ++ } ++ ++ fn capture_index(&self, name: &str) -> Option { ++ self.names.get(name).map(|i| *i) ++ } ++ ++ fn captures_at( ++ &self, ++ haystack: &[u8], ++ at: usize, ++ caps: &mut RegexCaptures, ++ ) -> Result { ++ caps.strip_crlf(false); ++ let r = self.regex.captures_read_at(caps.locations(), haystack, at); ++ if !r.is_some() { ++ return Ok(false); ++ } ++ ++ // If the end of our match includes a `\r`, then strip it from all ++ // capture groups ending at the same location. ++ let end = caps.locations().get(0).unwrap().1; ++ if end > 0 && haystack.get(end - 1) == Some(&b'\r') { ++ caps.strip_crlf(true); ++ } ++ Ok(true) ++ } ++ ++ // We specifically do not implement other methods like find_iter or ++ // captures_iter. Namely, the iter methods are guaranteed to be correct ++ // by virtue of implementing find_at and captures_at above. ++} ++ ++/// If the given match ends with a `\r`, then return a new match that ends ++/// immediately before the `\r`. ++pub fn adjust_match(haystack: &[u8], m: Match) -> Match { ++ if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') { ++ m.with_end(m.end() - 1) ++ } else { ++ m ++ } ++} ++ + /// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. + /// + /// This does not preserve the exact semantics of the given expression, +diff --git a/src/matcher.rs b/src/matcher.rs +index 831d573..b15d824 100644 +--- a/src/matcher.rs ++++ b/src/matcher.rs +@@ -6,6 +6,7 @@ use grep_matcher::{ + use regex::bytes::{CaptureLocations, Regex}; + + use config::{Config, ConfiguredHIR}; ++use crlf::CRLFMatcher; + use error::Error; + use word::WordMatcher; + +@@ -344,6 +345,11 @@ impl RegexMatcher { + enum RegexMatcherImpl { + /// The standard matcher used for all regular expressions. + Standard(StandardMatcher), ++ /// A matcher that strips `\r` from the end of matches. ++ /// ++ /// This is only used when the CRLF hack is enabled and the regex is line ++ /// anchored at the end. ++ CRLF(CRLFMatcher), + /// A matcher that only matches at word boundaries. This transforms the + /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. + /// Because of this, the WordMatcher provides its own implementation of +@@ -358,6 +364,8 @@ impl RegexMatcherImpl { + fn new(expr: &ConfiguredHIR) -> Result { + if expr.config().word { + Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?)) ++ } else if expr.needs_crlf_stripped() { ++ Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?)) + } else { + Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)) + } +@@ -379,6 +387,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.find_at(haystack, at), ++ CRLF(ref m) => m.find_at(haystack, at), + Word(ref m) => m.find_at(haystack, at), + } + } +@@ -387,6 +396,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.new_captures(), ++ CRLF(ref m) => m.new_captures(), + Word(ref m) => m.new_captures(), + } + } +@@ -395,6 +405,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.capture_count(), ++ CRLF(ref m) => m.capture_count(), + Word(ref m) => m.capture_count(), + } + } +@@ -403,6 +414,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.capture_index(name), ++ CRLF(ref m) => m.capture_index(name), + Word(ref m) => m.capture_index(name), + } + } +@@ -411,6 +423,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.find(haystack), ++ CRLF(ref m) => m.find(haystack), + Word(ref m) => m.find(haystack), + } + } +@@ -425,6 +438,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.find_iter(haystack, matched), ++ CRLF(ref m) => m.find_iter(haystack, matched), + Word(ref m) => m.find_iter(haystack, matched), + } + } +@@ -439,6 +453,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.try_find_iter(haystack, matched), ++ CRLF(ref m) => m.try_find_iter(haystack, matched), + Word(ref m) => m.try_find_iter(haystack, matched), + } + } +@@ -451,6 +466,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.captures(haystack, caps), ++ CRLF(ref m) => m.captures(haystack, caps), + Word(ref m) => m.captures(haystack, caps), + } + } +@@ -466,6 +482,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.captures_iter(haystack, caps, matched), ++ CRLF(ref m) => m.captures_iter(haystack, caps, matched), + Word(ref m) => m.captures_iter(haystack, caps, matched), + } + } +@@ -481,6 +498,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.try_captures_iter(haystack, caps, matched), ++ CRLF(ref m) => m.try_captures_iter(haystack, caps, matched), + Word(ref m) => m.try_captures_iter(haystack, caps, matched), + } + } +@@ -494,6 +512,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.captures_at(haystack, at, caps), ++ CRLF(ref m) => m.captures_at(haystack, at, caps), + Word(ref m) => m.captures_at(haystack, at, caps), + } + } +@@ -509,6 +528,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.replace(haystack, dst, append), ++ CRLF(ref m) => m.replace(haystack, dst, append), + Word(ref m) => m.replace(haystack, dst, append), + } + } +@@ -527,6 +547,9 @@ impl Matcher for RegexMatcher { + Standard(ref m) => { + m.replace_with_captures(haystack, caps, dst, append) + } ++ CRLF(ref m) => { ++ m.replace_with_captures(haystack, caps, dst, append) ++ } + Word(ref m) => { + m.replace_with_captures(haystack, caps, dst, append) + } +@@ -537,6 +560,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.is_match(haystack), ++ CRLF(ref m) => m.is_match(haystack), + Word(ref m) => m.is_match(haystack), + } + } +@@ -549,6 +573,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.is_match_at(haystack, at), ++ CRLF(ref m) => m.is_match_at(haystack, at), + Word(ref m) => m.is_match_at(haystack, at), + } + } +@@ -560,6 +585,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.shortest_match(haystack), ++ CRLF(ref m) => m.shortest_match(haystack), + Word(ref m) => m.shortest_match(haystack), + } + } +@@ -572,6 +598,7 @@ impl Matcher for RegexMatcher { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.shortest_match_at(haystack, at), ++ CRLF(ref m) => m.shortest_match_at(haystack, at), + Word(ref m) => m.shortest_match_at(haystack, at), + } + } +@@ -712,6 +739,9 @@ pub struct RegexCaptures { + /// and the capturing groups must behave as if `(re)` is the `0`th capture + /// group. + offset: usize, ++ /// When enable, the end of a match has `\r` stripped from it, if one ++ /// exists. ++ strip_crlf: bool, + } + + impl Captures for RegexCaptures { +@@ -720,8 +750,25 @@ impl Captures for RegexCaptures { + } + + fn get(&self, i: usize) -> Option { +- let actual = i.checked_add(self.offset).unwrap(); +- self.locs.pos(actual).map(|(s, e)| Match::new(s, e)) ++ if !self.strip_crlf { ++ let actual = i.checked_add(self.offset).unwrap(); ++ return self.locs.pos(actual).map(|(s, e)| Match::new(s, e)); ++ } ++ ++ // currently don't support capture offsetting with CRLF stripping ++ assert_eq!(self.offset, 0); ++ let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) { ++ None => return None, ++ Some(m) => m, ++ }; ++ // If the end position of this match corresponds to the end position ++ // of the overall match, then we apply our CRLF stripping. Otherwise, ++ // we cannot assume stripping is correct. ++ if i == 0 || m.end() == self.locs.pos(0).unwrap().1 { ++ Some(m.with_end(m.end() - 1)) ++ } else { ++ Some(m) ++ } + } + } + +@@ -734,12 +781,16 @@ impl RegexCaptures { + locs: CaptureLocations, + offset: usize, + ) -> RegexCaptures { +- RegexCaptures { locs, offset } ++ RegexCaptures { locs, offset, strip_crlf: false } + } + + pub(crate) fn locations(&mut self) -> &mut CaptureLocations { + &mut self.locs + } ++ ++ pub(crate) fn strip_crlf(&mut self, yes: bool) { ++ self.strip_crlf = yes; ++ } + } + + #[cfg(test)] +-- +2.20.1 + diff --git a/grep-regex-fix-metadata.diff b/grep-regex-fix-metadata.diff new file mode 100644 index 0000000..0fb1b59 --- /dev/null +++ b/grep-regex-fix-metadata.diff @@ -0,0 +1,11 @@ +--- grep-regex-0.1.1/Cargo.toml 1970-01-01T01:00:00+01:00 ++++ grep-regex-0.1.1/Cargo.toml 2019-02-14T09:25:19.720854+01:00 +@@ -31,7 +31,7 @@ + version = "1.0.5" + + [dependencies.regex-syntax] +-version = "0.6.2" ++version = "0.6.5" + + [dependencies.thread_local] + version = "0.3.6" diff --git a/rust-grep-regex.spec b/rust-grep-regex.spec index 16c639e..c09159a 100644 --- a/rust-grep-regex.spec +++ b/rust-grep-regex.spec @@ -7,20 +7,25 @@ Name: rust-%{crate} Version: 0.1.1 -Release: 4%{?dist} +Release: 5%{?dist} Summary: Use Rust's regex library with the 'grep' crate # Upstream license specification: Unlicense/MIT License: Unlicense or MIT URL: https://crates.io/crates/grep-regex -Source0: https://crates.io/api/v1/crates/%{crate}/%{version}/download#/%{crate}-%{version}.crate +Source: %{crates_source} +# Initial patched metadata +# * Bump regex-syntax to 0.6.5 (for patch0001) +Patch0: grep-regex-fix-metadata.diff +# https://github.com/BurntSushi/ripgrep/commit/9d703110cfe01782d2d0b03a340f5983da215e68 +Patch0001: 0001-regex-make-CRLF-hack-more-robust.patch ExclusiveArch: %{rust_arches} BuildRequires: rust-packaging BuildRequires: (crate(grep-matcher/default) >= 0.1.1 with crate(grep-matcher/default) < 0.2.0) BuildRequires: (crate(log/default) >= 0.4.5 with crate(log/default) < 0.5.0) -BuildRequires: (crate(regex-syntax/default) >= 0.6.2 with crate(regex-syntax/default) < 0.7.0) +BuildRequires: (crate(regex-syntax/default) >= 0.6.5 with crate(regex-syntax/default) < 0.7.0) BuildRequires: (crate(regex/default) >= 1.0.5 with crate(regex/default) < 2.0.0) BuildRequires: (crate(thread_local/default) >= 0.3.6 with crate(thread_local/default) < 0.4.0) BuildRequires: (crate(utf8-ranges/default) >= 1.0.1 with crate(utf8-ranges/default) < 2.0.0) @@ -57,7 +62,7 @@ which use "default" feature of "%{crate}" crate. %ghost %{cargo_registry}/%{crate}-%{version}/Cargo.toml %prep -%autosetup -n %{crate}-%{version} -p1 +%autosetup -n %{crate}-%{version_no_tilde} -p1 %cargo_prep %build @@ -72,6 +77,9 @@ which use "default" feature of "%{crate}" crate. %endif %changelog +* Thu Feb 14 2019 Igor Gnatenko - 0.1.1-5 +- Backport fix for CRLF handling + * Sat Feb 02 2019 Fedora Release Engineering - 0.1.1-4 - Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild