Skip to content

Commit 2f01e19

Browse files
authored
[codex] Fix multiline quoted-field chunk starts (#1184)
* fix(chunking): reject bad chunk starts

  Require every sampled row to match the expected column count before accepting a multithreaded chunk boundary. Add a regression for issue #1139 covering multiline quoted fields split across chunk boundaries.

  Fixes #1139

* ci(deps): rerun after WeakRefStrings release

* fix(hash): support PointerString on nightly

  Handle the Julia nightly memhash removal in CSV.PointerString and add a focused basics regression for the seeded hash path. This is the remaining blocker from the fresh nightly CI rerun after WeakRefStrings 1.4.3 was released.
1 parent b2ed663 commit 2f01e19

3 files changed

Lines changed: 35 additions & 3 deletions

File tree

src/detection.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_
361361
# now we read the next `rows_to_check` rows and see if we get roughly the right # of columns
362362
rowstartpos = pos
363363
parsedncols = rowsparsed = 0
364+
matchedncols = true
364365
columnprops = Vector{ColumnProperties}(undef, ncols)
365366
for i = 1:ncols
366367
if origcoltypes[i] === NeedsTypeDetection
@@ -398,6 +399,7 @@ function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_
398399
end
399400
rowsparsed += ((pos < len) | (numcolsthisrow != 0)) # trailing newline does not count
400401
parsedncols += numcolsthisrow
402+
matchedncols &= numcolsthisrow == ncols
401403
end
402404
parsedncols += addtrailingcolumn
403405
lock(columnlock) do
@@ -414,7 +416,7 @@ function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_
414416
end
415417
end
416418
f40 = ncols * 0.025
417-
if (ncols - f40) <= (parsedncols / rowsparsed) <= (ncols + f40)
419+
if matchedncols && (ncols - f40) <= (parsedncols / rowsparsed) <= (ncols + f40)
418420
# ok, seems like we figured out the right start for parsing on this chunk
419421
Threads.atomic_add!(totalbytes, Int(pos - rowstartpos))
420422
Threads.atomic_add!(totalrows, rowsparsed)

src/utils.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -579,8 +579,14 @@ struct PointerString
579579
end
580580

581581
function Base.hash(s::PointerString, h::UInt)
582-
h += Base.memhash_seed
583-
ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s.ptr, s.len, h % UInt32) + h
582+
@static if isdefined(Base, :memhash_seed)
583+
h += Base.memhash_seed
584+
return ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s.ptr, s.len, h % UInt32) + h
585+
elseif isdefined(Base, :hash_bytes) && isdefined(Base, :HASH_SECRET)
586+
return Base.hash_bytes(s.ptr, s.len, UInt64(h), Base.HASH_SECRET) % UInt
587+
else
588+
return hash(String(s), h)
589+
end
584590
end
585591

586592
import Base: ==

test/basics.jl

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,22 @@ f = CSV.File(joinpath(dir, "escape_row_starts.csv"); ntasks=2)
456456
@test eltype(f.col1) == String
457457
@test eltype(f.col2) == Int
458458

459+
mktempdir() do tmp
460+
# 1139: multiline quoted fields should still parse correctly when a chunk
461+
# boundary initially lands inside the quoted field body.
462+
n = 4000
463+
path = joinpath(tmp, "issue1139.csv")
464+
text = "123\nabc"
465+
CSV.write(path, (id=1:n, text=fill(text, n)))
466+
ctx = CSV.Context(path; ntasks=2)
467+
@test ctx.threaded
468+
f = CSV.File(path; ntasks=2)
469+
@test length(f) == n
470+
@test all(==(text), f.text)
471+
@test map(x -> parse(Int, string(x)), f.id[2038:2040]) == [2038, 2039, 2040]
472+
@test f.text[2038:2040] == fill(text, 3)
473+
end
474+
459475
f = CSV.File(IOBuffer("col1\nhey\nthere\nsailor"); stringtype=PosLenString)
460476
@test f.col1 isa PosLenStringVector
461477
@test Tables.columnnames(f) == [:col1]
@@ -467,6 +483,14 @@ f = CSV.File(IOBuffer("col1\nhey\nthere\nsailor"); stringtype=PosLenString)
467483
@test columntable(f) == columntable(collect(f))
468484
show(f)
469485

486+
let str = "hash me", seed = UInt(0x1234)
487+
GC.@preserve str begin
488+
ptrstr = CSV.PointerString(pointer(str), ncodeunits(str))
489+
@test hash(ptrstr, seed) == hash(str, seed)
490+
@test hash(ptrstr) isa UInt
491+
end
492+
end
493+
470494
f = CSV.File(joinpath(dir, "big_types.csv"); stringtype=PosLenString, pool=false)
471495
@test eltype(f.time) == Dates.Time
472496
@test eltype(f.bool) == Bool

0 commit comments

Comments
 (0)