Skip to content

Commit cc9f869

Browse files
Dandandanalamb
andauthored
Optimize regexp_replace by stripping trailing .* from anchored patterns. 2.4x improvement (ClickBench Q28) (#21379)
## Which issue does this PR close? - Closes: #21382 ## Rationale for this change `regexp_replace` with anchored patterns like `^https?://(?:www\.)?([^/]+)/.*$` spends time scanning the trailing `.*$` and using `captures()` + `expand()` with `String` allocation on every row. It just happens this `SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1')` query benefits from this optimization (2.4x faster) ## What changes are included in this PR? - Strip trailing `.*$` from the pattern string for anchored patterns where the replacement is `\1` - Use `captures_read` with pre-allocated `CaptureLocations` for direct byte-slice extraction ## Are these changes tested? Yes, covered by existing `regexp_replace` unit tests, ClickBench sqllogictests, and the new URL domain extraction sqllogictest. ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> No. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent e1ad871 commit cc9f869

File tree

2 files changed

+121
-13
lines changed

2 files changed

+121
-13
lines changed

datafusion/functions/src/regex/regexpreplace.rs

Lines changed: 92 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
// under the License.
1717

1818
//! Regex expressions
19+
use memchr::memchr;
20+
1921
use arrow::array::ArrayDataBuilder;
2022
use arrow::array::BufferBuilder;
2123
use arrow::array::GenericStringArray;
@@ -199,6 +201,24 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
199201
.into_owned()
200202
}
201203

204+
/// For anchored patterns like `^...(capture)....*$` where the replacement
205+
/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
206+
/// `captures_read` with `CaptureLocations` for direct extraction — no
207+
/// `expand()`, no `String` allocation.
208+
/// This pattern appears in ClickBench Q28: which uses a regexp like
209+
/// `^https?://(?:www\.)?([^/]+)/.*$`
210+
fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option<Regex> {
211+
if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") {
212+
return None;
213+
}
214+
let short = &pattern[..pattern.len() - 3];
215+
let re = Regex::new(short).ok()?;
216+
if re.captures_len() != 2 {
217+
return None;
218+
}
219+
Some(re)
220+
}
221+
202222
/// Replaces substring(s) matching a PCRE-like regular expression.
203223
///
204224
/// The full list of supported features and syntax can be found at
@@ -457,6 +477,14 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
457477
// with rust ones.
458478
let replacement = regex_replace_posix_groups(replacement);
459479

480+
// For anchored patterns like ^...(capture)....*$, build a shorter
481+
// regex and use captures_read for direct extraction.
482+
let short_re = if limit == 1 {
483+
try_build_short_extract_regex(&pattern, &replacement)
484+
} else {
485+
None
486+
};
487+
460488
let string_array_type = args[0].data_type();
461489
match string_array_type {
462490
DataType::Utf8 | DataType::LargeUtf8 => {
@@ -473,13 +501,37 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
473501
let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
474502
new_offsets.append(T::zero());
475503

476-
string_array.iter().for_each(|val| {
477-
if let Some(val) = val {
478-
let result = re.replacen(val, limit, replacement.as_str());
479-
vals.append_slice(result.as_bytes());
480-
}
481-
new_offsets.append(T::from_usize(vals.len()).unwrap());
482-
});
504+
if let Some(ref short_re) = short_re {
505+
let mut locs = short_re.capture_locations();
506+
string_array.iter().for_each(|val| {
507+
if let Some(val) = val {
508+
if short_re.captures_read(&mut locs, val).is_some() {
509+
let match_end = locs.get(0).unwrap().1;
510+
if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() {
511+
if let Some((start, end)) = locs.get(1) {
512+
vals.append_slice(&val.as_bytes()[start..end]);
513+
}
514+
} else {
515+
// Newline in remainder: .*$ wouldn't match without 's' flag
516+
let result =
517+
re.replacen(val, limit, replacement.as_str());
518+
vals.append_slice(result.as_bytes());
519+
}
520+
} else {
521+
vals.append_slice(val.as_bytes());
522+
}
523+
}
524+
new_offsets.append(T::from_usize(vals.len()).unwrap());
525+
});
526+
} else {
527+
string_array.iter().for_each(|val| {
528+
if let Some(val) = val {
529+
let result = re.replacen(val, limit, replacement.as_str());
530+
vals.append_slice(result.as_bytes());
531+
}
532+
new_offsets.append(T::from_usize(vals.len()).unwrap());
533+
});
534+
}
483535

484536
let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
485537
.len(string_array.len())
@@ -494,12 +546,39 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
494546

495547
let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
496548

497-
for val in string_view_array.iter() {
498-
if let Some(val) = val {
499-
let result = re.replacen(val, limit, replacement.as_str());
500-
builder.append_value(result);
501-
} else {
502-
builder.append_null();
549+
if let Some(ref short_re) = short_re {
550+
let mut locs = short_re.capture_locations();
551+
for val in string_view_array.iter() {
552+
if let Some(val) = val {
553+
if short_re.captures_read(&mut locs, val).is_some() {
554+
let match_end = locs.get(0).unwrap().1;
555+
if memchr(b'\n', &val.as_bytes()[match_end..]).is_none() {
556+
if let Some((start, end)) = locs.get(1) {
557+
builder.append_value(&val[start..end]);
558+
} else {
559+
builder.append_value("");
560+
}
561+
} else {
562+
// Newline in remainder: .*$ wouldn't match without 's' flag
563+
let result =
564+
re.replacen(val, limit, replacement.as_str());
565+
builder.append_value(result);
566+
}
567+
} else {
568+
builder.append_value(val);
569+
}
570+
} else {
571+
builder.append_null();
572+
}
573+
}
574+
} else {
575+
for val in string_view_array.iter() {
576+
if let Some(val) = val {
577+
let result = re.replacen(val, limit, replacement.as_str());
578+
builder.append_value(result);
579+
} else {
580+
builder.append_null();
581+
}
503582
}
504583
}
505584

datafusion/sqllogictest/test_files/regexp/regexp_replace.slt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,35 @@ from (values ('a'), ('b')) as tbl(col);
128128
NULL NULL NULL
129129
NULL NULL NULL
130130

131+
# Extract domain from URL using anchored pattern with trailing .*
132+
# This tests that the full URL suffix is replaced, not just the matched prefix
133+
query T
134+
SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES
135+
('https://www.example.com/path/to/page?q=1'),
136+
('http://test.org/foo/bar'),
137+
('https://example.com/'),
138+
('not-a-url')
139+
) AS t(url);
140+
----
141+
example.com
142+
test.org
143+
example.com
144+
not-a-url
145+
146+
# More than one capture group should disable the short-regex fast path.
147+
# This still uses replacement \1, but captures_len() will be > 2, so the
148+
# implementation must fall back to the normal regexp_replace path.
149+
query T
150+
SELECT regexp_replace(url, '^https?://((www\.)?([^/]+))/.*$', '\1') FROM (VALUES
151+
('https://www.example.com/path/to/page?q=1'),
152+
('http://test.org/foo/bar'),
153+
('not-a-url')
154+
) AS t(url);
155+
----
156+
www.example.com
157+
test.org
158+
not-a-url
159+
131160
# If the overall pattern matches but capture group 1 does not participate,
132161
# regexp_replace(..., '\1') should substitute the empty string, not keep
133162
# the original input.

0 commit comments

Comments
 (0)