[KYUUBI #7422] [KSHC] Fix FileWriterFactory using the same TaskAttemptId for different task attempts

maomaodev · pan3793 · commit a99d6e03a37c · 2026-04-27T10:52:52.000+08:00
### Why are the changes needed? Port SPARK-48484 to KSHC. Fix #7421. In KSHC, `FileWriterFactory` is forked from Spark's `org.apache.spark.sql.execution.datasources.v2.FileWriterFactory`. However, the fork still contains a bug that was later fixed on the Spark side by apache/spark#46811: the `TaskAttemptID` was always created with attempt number 0, so different attempts of the same task shared one `TaskAttemptID`. This PR ports that upstream fix to KSHC by deriving the attempt number from the task ID passed to `createWriter`. ### How was this patch tested? A new unit test, `FileWriterFactorySuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #7422 from maomaodev/kyuubi-7421. Closes #7422 1272f87 [lifumao] [KSHC] Fix FileWriterFactory using the same TaskAttemptId for different task attempts Authored-by: lifumao <lifumao@tencent.com> Signed-off-by: Cheng Pan <chengpan@apache.org>
diff --git a/extensions/spark/kyuubi-spark-connector-hive/pom.xml b/extensions/spark/kyuubi-spark-connector-hive/pom.xml
@@ -69,6 +69,12 @@
             <scope>test</scope>
         </dependency>
 
+        <dependency>
+            <groupId>org.scalatestplus</groupId>
+            <artifactId>mockito-4-11_${scala.binary.version}</artifactId>
+            <scope>test</scope>
+        </dependency>
+
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_${scala.binary.version}</artifactId>
diff --git a/extensions/spark/kyuubi-spark-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/write/FileWriterFactory.scala b/extensions/spark/kyuubi-spark-connector-hive/src/main/scala/org/apache/kyuubi/spark/connector/hive/write/FileWriterFactory.scala
@@ -43,7 +43,7 @@ case class FileWriterFactory(
   @transient private lazy val jobId = createJobID(jobTrackerID, 0)
 
   override def createWriter(partitionId: Int, realTaskId: Long): DataWriter[InternalRow] = {
-    val taskAttemptContext = createTaskAttemptContext(partitionId)
+    val taskAttemptContext = createTaskAttemptContext(partitionId, realTaskId.toInt & Int.MaxValue)
     committer.setupTask(taskAttemptContext)
     if (description.partitionColumns.isEmpty) {
       new SingleDirectoryDataWriter(description, taskAttemptContext, committer)
@@ -52,9 +52,11 @@ case class FileWriterFactory(
     }
   }
 
-  private def createTaskAttemptContext(partitionId: Int): TaskAttemptContextImpl = {
+  private def createTaskAttemptContext(
+      partitionId: Int,
+      realTaskId: Int): TaskAttemptContextImpl = {
     val taskId = new TaskID(jobId, TaskType.MAP, partitionId)
-    val taskAttemptId = new TaskAttemptID(taskId, 0)
+    val taskAttemptId = new TaskAttemptID(taskId, realTaskId)
     // Set up the configuration object
     val hadoopConf = description.serializableHadoopConf.value
     hadoopConf.set("mapreduce.job.id", jobId.toString)
diff --git a/extensions/spark/kyuubi-spark-connector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/write/FileWriterFactorySuite.scala b/extensions/spark/kyuubi-spark-connector-hive/src/test/scala/org/apache/kyuubi/spark/connector/hive/write/FileWriterFactorySuite.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.kyuubi.spark.connector.hive.write
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.internal.io.FileCommitProtocol
+import org.apache.spark.sql.execution.datasources.WriteJobDescription
+import org.apache.spark.util.SerializableConfiguration
+import org.mockito.Mockito._
+import org.scalatest.PrivateMethodTester
+
+/** Verifies that each task attempt gets its own Hadoop TaskAttemptID. */
+class FileWriterFactorySuite extends SparkFunSuite with PrivateMethodTester {
+
+  test("V2Write uses different TaskAttemptIds for different task attempts") {
+    // Stub the Hadoop configuration, which createTaskAttemptContext reads from the description.
+    val hadoopConf = new SerializableConfiguration(new Configuration(false))
+    val description = mock(classOf[WriteJobDescription])
+    when(description.serializableHadoopConf).thenReturn(hadoopConf)
+    val factory = FileWriterFactory(description, mock(classOf[FileCommitProtocol]))
+
+    // Reach the private helper reflectively: same partition id, two different attempt numbers.
+    val newContext = PrivateMethod[TaskAttemptContextImpl](Symbol("createTaskAttemptContext"))
+    val firstAttemptId = factory.invokePrivate(newContext(0, 1)).getTaskAttemptID
+    val secondAttemptId = factory.invokePrivate(newContext(0, 2)).getTaskAttemptID
+
+    // The partition is shared, so the TaskID matches; the attempt ids must differ.
+    assert(firstAttemptId.getTaskID == secondAttemptId.getTaskID)
+    assert(firstAttemptId.getId != secondAttemptId.getId)
+  }
+}