From 00bd1f2f1d57c1bf1534694fee0aeeb4999cc2c5 Mon Sep 17 00:00:00 2001 From: Igor Gnatenko Date: Sat, 16 Feb 2019 17:39:08 +0100 Subject: [PATCH] Update to 0.1.2 Signed-off-by: Igor Gnatenko --- .gitignore | 1 + 0001-regex-make-CRLF-hack-more-robust.patch | 387 -------------------- grep-regex-fix-metadata.diff | 11 - rust-grep-regex.spec | 14 +- sources | 2 +- 5 files changed, 8 insertions(+), 407 deletions(-) delete mode 100644 0001-regex-make-CRLF-hack-more-robust.patch delete mode 100644 grep-regex-fix-metadata.diff diff --git a/.gitignore b/.gitignore index 2930121..11b692a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /grep-regex-0.1.1.crate +/grep-regex-0.1.2.crate diff --git a/0001-regex-make-CRLF-hack-more-robust.patch b/0001-regex-make-CRLF-hack-more-robust.patch deleted file mode 100644 index efaf64e..0000000 --- a/0001-regex-make-CRLF-hack-more-robust.patch +++ /dev/null @@ -1,387 +0,0 @@ -From 9d703110cfe01782d2d0b03a340f5983da215e68 Mon Sep 17 00:00:00 2001 -From: Andrew Gallant -Date: Sat, 26 Jan 2019 12:25:21 -0500 -Subject: [PATCH] regex: make CRLF hack more robust - -This commit improves the CRLF hack to be more robust. In particular, in -addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end -of a match if and only if the regex has an ending line anchor required for -a match. This doesn't quite make the hack 100% correct, but should fix most -use cases in practice. An example of a regex that will still be incorrect -is `foo|bar$`, since the analysis isn't quite sophisticated enough to -determine that a `\r` can be safely stripped from any match. Even if we -fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas, -more work on this front should really be focused on enabling this in the -regex engine itself. - -The specific cause of this bug was that grep-searcher was sneakily -stripping CRLF from matching lines when it really shouldn't have. We remove -that code now, and instead rely on better match semantics provided at a -lower level. - -Fixes #1095 ---- - src/config.rs | 8 ++++ - src/crlf.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++++++ - src/matcher.rs | 57 ++++++++++++++++++++++++++-- - 3 files changed, 162 insertions(+), 3 deletions(-) - -diff --git a/src/config.rs b/src/config.rs -index f3d1f1c..efed9d4 100644 ---- a/src/config.rs -+++ b/src/config.rs -@@ -160,6 +160,14 @@ impl ConfiguredHIR { - non_matching_bytes(&self.expr) - } - -+ /// Returns true if and only if this regex needs to have its match offsets -+ /// tweaked because of CRLF support. Specifically, this occurs when the -+ /// CRLF hack is enabled and the regex is line anchored at the end. In -+ /// this case, matches that end with a `\r` have the `\r` stripped. -+ pub fn needs_crlf_stripped(&self) -> bool { -+ self.config.crlf && self.expr.is_line_anchored_end() -+ } -+ - /// Builds a regular expression from this HIR expression. - pub fn regex(&self) -> Result { - self.pattern_to_regex(&self.expr.to_string()) -diff --git a/src/crlf.rs b/src/crlf.rs -index ff6b15b..838a28f 100644 ---- a/src/crlf.rs -+++ b/src/crlf.rs -@@ -1,5 +1,105 @@ -+use std::collections::HashMap; -+ -+use grep_matcher::{Match, Matcher, NoError}; -+use regex::bytes::Regex; - use regex_syntax::hir::{self, Hir, HirKind}; - -+use config::ConfiguredHIR; -+use error::Error; -+use matcher::RegexCaptures; -+ -+/// A matcher for implementing "word match" semantics. -+#[derive(Clone, Debug)] -+pub struct CRLFMatcher { -+ /// The regex. -+ regex: Regex, -+ /// A map from capture group name to capture group index. -+ names: HashMap, -+} -+ -+impl CRLFMatcher { -+ /// Create a new matcher from the given pattern that strips `\r` from the -+ /// end of every match. -+ /// -+ /// This panics if the given expression doesn't need its CRLF stripped. -+ pub fn new(expr: &ConfiguredHIR) -> Result { -+ assert!(expr.needs_crlf_stripped()); -+ -+ let regex = expr.regex()?; -+ let mut names = HashMap::new(); -+ for (i, optional_name) in regex.capture_names().enumerate() { -+ if let Some(name) = optional_name { -+ names.insert(name.to_string(), i.checked_sub(1).unwrap()); -+ } -+ } -+ Ok(CRLFMatcher { regex, names }) -+ } -+} -+ -+impl Matcher for CRLFMatcher { -+ type Captures = RegexCaptures; -+ type Error = NoError; -+ -+ fn find_at( -+ &self, -+ haystack: &[u8], -+ at: usize, -+ ) -> Result, NoError> { -+ let m = match self.regex.find_at(haystack, at) { -+ None => return Ok(None), -+ Some(m) => Match::new(m.start(), m.end()), -+ }; -+ Ok(Some(adjust_match(haystack, m))) -+ } -+ -+ fn new_captures(&self) -> Result { -+ Ok(RegexCaptures::new(self.regex.capture_locations())) -+ } -+ -+ fn capture_count(&self) -> usize { -+ self.regex.captures_len().checked_sub(1).unwrap() -+ } -+ -+ fn capture_index(&self, name: &str) -> Option { -+ self.names.get(name).map(|i| *i) -+ } -+ -+ fn captures_at( -+ &self, -+ haystack: &[u8], -+ at: usize, -+ caps: &mut RegexCaptures, -+ ) -> Result { -+ caps.strip_crlf(false); -+ let r = self.regex.captures_read_at(caps.locations(), haystack, at); -+ if !r.is_some() { -+ return Ok(false); -+ } -+ -+ // If the end of our match includes a `\r`, then strip it from all -+ // capture groups ending at the same location. -+ let end = caps.locations().get(0).unwrap().1; -+ if end > 0 && haystack.get(end - 1) == Some(&b'\r') { -+ caps.strip_crlf(true); -+ } -+ Ok(true) -+ } -+ -+ // We specifically do not implement other methods like find_iter or -+ // captures_iter. Namely, the iter methods are guaranteed to be correct -+ // by virtue of implementing find_at and captures_at above. -+} -+ -+/// If the given match ends with a `\r`, then return a new match that ends -+/// immediately before the `\r`. -+pub fn adjust_match(haystack: &[u8], m: Match) -> Match { -+ if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') { -+ m.with_end(m.end() - 1) -+ } else { -+ m -+ } -+} -+ - /// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. - /// - /// This does not preserve the exact semantics of the given expression, -diff --git a/src/matcher.rs b/src/matcher.rs -index 831d573..b15d824 100644 ---- a/src/matcher.rs -+++ b/src/matcher.rs -@@ -6,6 +6,7 @@ use grep_matcher::{ - use regex::bytes::{CaptureLocations, Regex}; - - use config::{Config, ConfiguredHIR}; -+use crlf::CRLFMatcher; - use error::Error; - use word::WordMatcher; - -@@ -344,6 +345,11 @@ impl RegexMatcher { - enum RegexMatcherImpl { - /// The standard matcher used for all regular expressions. - Standard(StandardMatcher), -+ /// A matcher that strips `\r` from the end of matches. -+ /// -+ /// This is only used when the CRLF hack is enabled and the regex is line -+ /// anchored at the end. -+ CRLF(CRLFMatcher), - /// A matcher that only matches at word boundaries. This transforms the - /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. - /// Because of this, the WordMatcher provides its own implementation of -@@ -358,6 +364,8 @@ impl RegexMatcherImpl { - fn new(expr: &ConfiguredHIR) -> Result { - if expr.config().word { - Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?)) -+ } else if expr.needs_crlf_stripped() { -+ Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?)) - } else { - Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)) - } -@@ -379,6 +387,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find_at(haystack, at), -+ CRLF(ref m) => m.find_at(haystack, at), - Word(ref m) => m.find_at(haystack, at), - } - } -@@ -387,6 +396,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.new_captures(), -+ CRLF(ref m) => m.new_captures(), - Word(ref m) => m.new_captures(), - } - } -@@ -395,6 +405,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.capture_count(), -+ CRLF(ref m) => m.capture_count(), - Word(ref m) => m.capture_count(), - } - } -@@ -403,6 +414,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.capture_index(name), -+ CRLF(ref m) => m.capture_index(name), - Word(ref m) => m.capture_index(name), - } - } -@@ -411,6 +423,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find(haystack), -+ CRLF(ref m) => m.find(haystack), - Word(ref m) => m.find(haystack), - } - } -@@ -425,6 +438,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find_iter(haystack, matched), -+ CRLF(ref m) => m.find_iter(haystack, matched), - Word(ref m) => m.find_iter(haystack, matched), - } - } -@@ -439,6 +453,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.try_find_iter(haystack, matched), -+ CRLF(ref m) => m.try_find_iter(haystack, matched), - Word(ref m) => m.try_find_iter(haystack, matched), - } - } -@@ -451,6 +466,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures(haystack, caps), -+ CRLF(ref m) => m.captures(haystack, caps), - Word(ref m) => m.captures(haystack, caps), - } - } -@@ -466,6 +482,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures_iter(haystack, caps, matched), -+ CRLF(ref m) => m.captures_iter(haystack, caps, matched), - Word(ref m) => m.captures_iter(haystack, caps, matched), - } - } -@@ -481,6 +498,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.try_captures_iter(haystack, caps, matched), -+ CRLF(ref m) => m.try_captures_iter(haystack, caps, matched), - Word(ref m) => m.try_captures_iter(haystack, caps, matched), - } - } -@@ -494,6 +512,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures_at(haystack, at, caps), -+ CRLF(ref m) => m.captures_at(haystack, at, caps), - Word(ref m) => m.captures_at(haystack, at, caps), - } - } -@@ -509,6 +528,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.replace(haystack, dst, append), -+ CRLF(ref m) => m.replace(haystack, dst, append), - Word(ref m) => m.replace(haystack, dst, append), - } - } -@@ -527,6 +547,9 @@ impl Matcher for RegexMatcher { - Standard(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } -+ CRLF(ref m) => { -+ m.replace_with_captures(haystack, caps, dst, append) -+ } - Word(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } -@@ -537,6 +560,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.is_match(haystack), -+ CRLF(ref m) => m.is_match(haystack), - Word(ref m) => m.is_match(haystack), - } - } -@@ -549,6 +573,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.is_match_at(haystack, at), -+ CRLF(ref m) => m.is_match_at(haystack, at), - Word(ref m) => m.is_match_at(haystack, at), - } - } -@@ -560,6 +585,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.shortest_match(haystack), -+ CRLF(ref m) => m.shortest_match(haystack), - Word(ref m) => m.shortest_match(haystack), - } - } -@@ -572,6 +598,7 @@ impl Matcher for RegexMatcher { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.shortest_match_at(haystack, at), -+ CRLF(ref m) => m.shortest_match_at(haystack, at), - Word(ref m) => m.shortest_match_at(haystack, at), - } - } -@@ -712,6 +739,9 @@ pub struct RegexCaptures { - /// and the capturing groups must behave as if `(re)` is the `0`th capture - /// group. - offset: usize, -+ /// When enable, the end of a match has `\r` stripped from it, if one -+ /// exists. -+ strip_crlf: bool, - } - - impl Captures for RegexCaptures { -@@ -720,8 +750,25 @@ impl Captures for RegexCaptures { - } - - fn get(&self, i: usize) -> Option { -- let actual = i.checked_add(self.offset).unwrap(); -- self.locs.pos(actual).map(|(s, e)| Match::new(s, e)) -+ if !self.strip_crlf { -+ let actual = i.checked_add(self.offset).unwrap(); -+ return self.locs.pos(actual).map(|(s, e)| Match::new(s, e)); -+ } -+ -+ // currently don't support capture offsetting with CRLF stripping -+ assert_eq!(self.offset, 0); -+ let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) { -+ None => return None, -+ Some(m) => m, -+ }; -+ // If the end position of this match corresponds to the end position -+ // of the overall match, then we apply our CRLF stripping. Otherwise, -+ // we cannot assume stripping is correct. -+ if i == 0 || m.end() == self.locs.pos(0).unwrap().1 { -+ Some(m.with_end(m.end() - 1)) -+ } else { -+ Some(m) -+ } - } - } - -@@ -734,12 +781,16 @@ impl RegexCaptures { - locs: CaptureLocations, - offset: usize, - ) -> RegexCaptures { -- RegexCaptures { locs, offset } -+ RegexCaptures { locs, offset, strip_crlf: false } - } - - pub(crate) fn locations(&mut self) -> &mut CaptureLocations { - &mut self.locs - } -+ -+ pub(crate) fn strip_crlf(&mut self, yes: bool) { -+ self.strip_crlf = yes; -+ } - } - - #[cfg(test)] --- -2.20.1 - diff --git a/grep-regex-fix-metadata.diff b/grep-regex-fix-metadata.diff deleted file mode 100644 index 0fb1b59..0000000 --- a/grep-regex-fix-metadata.diff +++ /dev/null @@ -1,11 +0,0 @@ ---- grep-regex-0.1.1/Cargo.toml 1970-01-01T01:00:00+01:00 -+++ grep-regex-0.1.1/Cargo.toml 2019-02-14T09:25:19.720854+01:00 -@@ -31,7 +31,7 @@ - version = "1.0.5" - - [dependencies.regex-syntax] --version = "0.6.2" -+version = "0.6.5" - - [dependencies.thread_local] - version = "0.3.6" diff --git a/rust-grep-regex.spec b/rust-grep-regex.spec index c09159a..c09c074 100644 --- a/rust-grep-regex.spec +++ b/rust-grep-regex.spec @@ -6,19 +6,14 @@ %global crate grep-regex Name: rust-%{crate} -Version: 0.1.1 -Release: 5%{?dist} +Version: 0.1.2 +Release: 1%{?dist} Summary: Use Rust's regex library with the 'grep' crate # Upstream license specification: Unlicense/MIT License: Unlicense or MIT URL: https://crates.io/crates/grep-regex Source: %{crates_source} -# Initial patched metadata -# * Bump regex-syntax to 0.6.5 (for patch0001) -Patch0: grep-regex-fix-metadata.diff -# https://github.com/BurntSushi/ripgrep/commit/9d703110cfe01782d2d0b03a340f5983da215e68 -Patch0001: 0001-regex-make-CRLF-hack-more-robust.patch ExclusiveArch: %{rust_arches} @@ -26,7 +21,7 @@ BuildRequires: rust-packaging BuildRequires: (crate(grep-matcher/default) >= 0.1.1 with crate(grep-matcher/default) < 0.2.0) BuildRequires: (crate(log/default) >= 0.4.5 with crate(log/default) < 0.5.0) BuildRequires: (crate(regex-syntax/default) >= 0.6.5 with crate(regex-syntax/default) < 0.7.0) -BuildRequires: (crate(regex/default) >= 1.0.5 with crate(regex/default) < 2.0.0) +BuildRequires: (crate(regex/default) >= 1.1.0 with crate(regex/default) < 2.0.0) BuildRequires: (crate(thread_local/default) >= 0.3.6 with crate(thread_local/default) < 0.4.0) BuildRequires: (crate(utf8-ranges/default) >= 1.0.1 with crate(utf8-ranges/default) < 2.0.0) @@ -77,6 +72,9 @@ which use "default" feature of "%{crate}" crate. %endif %changelog +* Sat Feb 16 2019 Igor Gnatenko - 0.1.2-1 +- Update to 0.1.2 + * Thu Feb 14 2019 Igor Gnatenko - 0.1.1-5 - Backport fix for CRLF handling diff --git a/sources b/sources index 258c53b..828bd23 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (grep-regex-0.1.1.crate) = 281585dca0d8e42778bda7a23780ddef38d31145c36e3ea831b1edb98be66c87bba553303fb973af1211aef21d1deb11d2f9466947fb0622b2d1bf180af01a1f +SHA512 (grep-regex-0.1.2.crate) = 4ccaeda895b554969398dd1875e486b1994c7ef539277e5f4f30549b8d65a18228a48a5375ade1638ad7c26fd79d57ef0273e6d1b14dc285d9690bd63572872d