[codex] Fix multiline quoted-field chunk starts (#1184)

quinnj · web-flow · commit 2f01e1989192 · 2026-04-23T13:58:28.000-06:00
* fix(chunking): reject bad chunk starts
  Require every sampled row to match the expected column count before accepting a multithreaded chunk boundary. Add a regression for issue #1139 covering multiline quoted fields split across chunk boundaries.
  Fixes #1139
* ci(deps): rerun after WeakRefStrings release
* fix(hash): support PointerString on nightly
  Handle the Julia nightly memhash removal in CSV.PointerString and add a focused basics regression for the seeded hash path. This is the remaining blocker from the fresh nightly CI rerun after WeakRefStrings 1.4.3 was released.
diff --git a/src/detection.jl b/src/detection.jl
@@ -361,6 +361,7 @@ function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_
         # now we read the next `rows_to_check` rows and see if we get the roughly the right # of columns
         rowstartpos = pos
         parsedncols = rowsparsed = 0
+        matchedncols = true
         columnprops = Vector{ColumnProperties}(undef, ncols)
         for i = 1:ncols
             if origcoltypes[i] === NeedsTypeDetection
@@ -398,6 +399,7 @@ function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_
             end
             rowsparsed += ((pos < len) | (numcolsthisrow != 0)) # trailing newline does not count
             parsedncols += numcolsthisrow
+            matchedncols &= numcolsthisrow == ncols
         end
         parsedncols += addtrailingcolumn
         lock(columnlock) do
@@ -414,7 +416,7 @@ function findchunkrowstart(ranges, i, buf, opts, typemap, downcast, ncols, rows_
             end
         end
         f40 = ncols * 0.025
-        if (ncols - f40) <= (parsedncols / rowsparsed) <= (ncols + f40)
+        if matchedncols && (ncols - f40) <= (parsedncols / rowsparsed) <= (ncols + f40)
             # ok, seems like we figured out the right start for parsing on this chunk
             Threads.atomic_add!(totalbytes, Int(pos - rowstartpos))
             Threads.atomic_add!(totalrows, rowsparsed)
diff --git a/src/utils.jl b/src/utils.jl
@@ -579,8 +579,14 @@ struct PointerString
 end
 
 function Base.hash(s::PointerString, h::UInt)
-    h += Base.memhash_seed
-    ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s.ptr, s.len, h % UInt32) + h
+    @static if isdefined(Base, :memhash_seed)
+        h += Base.memhash_seed
+        return ccall(Base.memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s.ptr, s.len, h % UInt32) + h
+    elseif isdefined(Base, :hash_bytes) && isdefined(Base, :HASH_SECRET)
+        return Base.hash_bytes(s.ptr, s.len, UInt64(h), Base.HASH_SECRET) % UInt
+    else
+        return hash(String(s), h)
+    end
 end
 
 import Base: ==
diff --git a/test/basics.jl b/test/basics.jl
@@ -456,6 +456,22 @@ f = CSV.File(joinpath(dir, "escape_row_starts.csv"); ntasks=2)
 @test eltype(f.col1) == String
 @test eltype(f.col2) == Int
 
+mktempdir() do tmp
+    # 1139: multiline quoted fields should still parse correctly when a chunk
+    # boundary initially lands inside the quoted field body.
+    n = 4000
+    path = joinpath(tmp, "issue1139.csv")
+    text = "123\nabc"
+    CSV.write(path, (id=1:n, text=fill(text, n)))
+    ctx = CSV.Context(path; ntasks=2)
+    @test ctx.threaded
+    f = CSV.File(path; ntasks=2)
+    @test length(f) == n
+    @test all(==(text), f.text)
+    @test map(x -> parse(Int, string(x)), f.id[2038:2040]) == [2038, 2039, 2040]
+    @test f.text[2038:2040] == fill(text, 3)
+end
+
 f = CSV.File(IOBuffer("col1\nhey\nthere\nsailor"); stringtype=PosLenString)
 @test f.col1 isa PosLenStringVector
 @test Tables.columnnames(f) == [:col1]
@@ -467,6 +483,14 @@ f = CSV.File(IOBuffer("col1\nhey\nthere\nsailor"); stringtype=PosLenString)
 @test columntable(f) == columntable(collect(f))
 show(f)
 
+let str = "hash me", seed = UInt(0x1234)
+    GC.@preserve str begin
+        ptrstr = CSV.PointerString(pointer(str), ncodeunits(str))
+        @test hash(ptrstr, seed) == hash(str, seed)
+        @test hash(ptrstr) isa UInt
+    end
+end
+
 f = CSV.File(joinpath(dir, "big_types.csv"); stringtype=PosLenString, pool=false)
 @test eltype(f.time) == Dates.Time
 @test eltype(f.bool) == Bool