Skip to content

Commit d61be49

Browse files
authored
perf: Optimize NULL handling in find_in_set (#21464)
## Which issue does this PR close? - Closes #21463. ## Rationale for this change `find_in_set` uses `PrimitiveArray::<T>::builder` to construct its results, which means building the internal null buffer in an iterative fashion (via repeated `append_null` calls). It is more efficient to construct the null buffer directly, via `NullBuffer::union` (when multiple arguments might be NULL) or just cloning the input null buffer (when passed a single argument). Benchmarks (ARM64): ``` - find_in_set/string_len_8: 589.0 µs → 529.0 µs (-10.2%) - find_in_set/string_len_32: 736.2 µs → 660.5 µs (-10.3%) - find_in_set/string_len_1024: 7.5 ms → 7.4 ms (-1.3%) - find_in_set/string_view_len_8: 616.0 µs → 579.9 µs (-5.9%) - find_in_set/string_view_len_32: 748.0 µs → 701.9 µs (-6.2%) - find_in_set/string_view_len_1024: 7.6 ms → 7.6 ms (0.0%) - find_in_set_scalar/string_len_8: 76.7 µs → 48.0 µs (-37.4%) - find_in_set_scalar/string_len_32: 76.5 µs → 47.6 µs (-37.8%) - find_in_set_scalar/string_len_1024: 76.2 µs → 48.0 µs (-37.0%) - find_in_set_scalar/string_view_len_8: 81.9 µs → 55.7 µs (-32.0%) - find_in_set_scalar/string_view_len_32: 85.5 µs → 56.8 µs (-33.6%) - find_in_set_scalar/string_view_len_1024: 85.2 µs → 57.4 µs (-32.6%) ``` The change should be an improvement for both scalar and array cases. The relative improvement is larger in the scalar case because the scalar case is doing less work and so NULL handling was a larger fraction of the total runtime. ## What changes are included in this PR? * Optimize NULL handling for both scalar and array arg cases ## Are these changes tested? Yes. ## Are there any user-facing changes? No.
1 parent b46634c commit d61be49

File tree

1 file changed

+52
-47
lines changed

1 file changed

+52
-47
lines changed

datafusion/functions/src/unicode/find_in_set.rs

Lines changed: 52 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
use std::sync::Arc;
1919

2020
use arrow::array::{
21-
ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait,
22-
PrimitiveArray,
21+
ArrayAccessor, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
2322
};
2423
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
24+
use arrow_buffer::NullBuffer;
2525

2626
use crate::utils::utf8_to_int_type;
2727
use datafusion_common::{
@@ -265,53 +265,55 @@ fn find_in_set_general<'a, T, V>(string_array: V, str_list_array: V) -> Result<A
265265
where
266266
T: ArrowPrimitiveType,
267267
T::Native: OffsetSizeTrait,
268-
V: ArrayAccessor<Item = &'a str>,
268+
V: ArrayAccessor<Item = &'a str> + Copy,
269269
{
270-
let string_iter = ArrayIter::new(string_array);
271-
let str_list_iter = ArrayIter::new(str_list_array);
272-
273-
let mut builder = PrimitiveArray::<T>::builder(string_iter.len());
274-
275-
string_iter
276-
.zip(str_list_iter)
277-
.for_each(
278-
|(string_opt, str_list_opt)| match (string_opt, str_list_opt) {
279-
(Some(string), Some(str_list)) => {
280-
let position = str_list
281-
.split(',')
282-
.position(|s| s == string)
283-
.map_or(0, |idx| idx + 1);
284-
builder.append_value(T::Native::from_usize(position).unwrap());
285-
}
286-
_ => builder.append_null(),
287-
},
288-
);
270+
let len = string_array.len();
271+
let nulls = NullBuffer::union(string_array.nulls(), str_list_array.nulls());
272+
let zero = T::Native::from_usize(0).unwrap();
273+
274+
let values: Vec<T::Native> = (0..len)
275+
.map(|i| {
276+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
277+
return zero;
278+
}
279+
let string = string_array.value(i);
280+
let str_list = str_list_array.value(i);
281+
let position = str_list
282+
.split(',')
283+
.position(|s| s == string)
284+
.map_or(0, |idx| idx + 1);
285+
T::Native::from_usize(position).unwrap()
286+
})
287+
.collect();
289288

290-
Ok(Arc::new(builder.finish()) as ArrayRef)
289+
Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls)) as ArrayRef)
291290
}
292291

293292
fn find_in_set_left_literal<'a, T, V>(string: &str, str_list_array: V) -> Result<ArrayRef>
294293
where
295294
T: ArrowPrimitiveType,
296295
T::Native: OffsetSizeTrait,
297-
V: ArrayAccessor<Item = &'a str>,
296+
V: ArrayAccessor<Item = &'a str> + Copy,
298297
{
299-
let mut builder = PrimitiveArray::<T>::builder(str_list_array.len());
300-
301-
let str_list_iter = ArrayIter::new(str_list_array);
302-
303-
str_list_iter.for_each(|str_list_opt| match str_list_opt {
304-
Some(str_list) => {
298+
let len = str_list_array.len();
299+
let nulls = str_list_array.nulls().cloned();
300+
let zero = T::Native::from_usize(0).unwrap();
301+
302+
let values: Vec<T::Native> = (0..len)
303+
.map(|i| {
304+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
305+
return zero;
306+
}
307+
let str_list = str_list_array.value(i);
305308
let position = str_list
306309
.split(',')
307310
.position(|s| s == string)
308311
.map_or(0, |idx| idx + 1);
309-
builder.append_value(T::Native::from_usize(position).unwrap());
310-
}
311-
None => builder.append_null(),
312-
});
312+
T::Native::from_usize(position).unwrap()
313+
})
314+
.collect();
313315

314-
Ok(Arc::new(builder.finish()) as ArrayRef)
316+
Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls)) as ArrayRef)
315317
}
316318

317319
fn find_in_set_right_literal<'a, T, V>(
@@ -321,24 +323,27 @@ fn find_in_set_right_literal<'a, T, V>(
321323
where
322324
T: ArrowPrimitiveType,
323325
T::Native: OffsetSizeTrait,
324-
V: ArrayAccessor<Item = &'a str>,
326+
V: ArrayAccessor<Item = &'a str> + Copy,
325327
{
326-
let mut builder = PrimitiveArray::<T>::builder(string_array.len());
327-
328-
let string_iter = ArrayIter::new(string_array);
329-
330-
string_iter.for_each(|string_opt| match string_opt {
331-
Some(string) => {
328+
let len = string_array.len();
329+
let nulls = string_array.nulls().cloned();
330+
let zero = T::Native::from_usize(0).unwrap();
331+
332+
let values: Vec<T::Native> = (0..len)
333+
.map(|i| {
334+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
335+
return zero;
336+
}
337+
let string = string_array.value(i);
332338
let position = str_list
333339
.iter()
334340
.position(|s| *s == string)
335341
.map_or(0, |idx| idx + 1);
336-
builder.append_value(T::Native::from_usize(position).unwrap());
337-
}
338-
None => builder.append_null(),
339-
});
342+
T::Native::from_usize(position).unwrap()
343+
})
344+
.collect();
340345

341-
Ok(Arc::new(builder.finish()) as ArrayRef)
346+
Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls)) as ArrayRef)
342347
}
343348

344349
#[cfg(test)]

0 commit comments

Comments
 (0)