Skip to content

Commit ef0d9f8

Browse files
authored
Improve IT cluster readiness diagnostics (#17903)
* Improve IT cluster readiness diagnostics
* Increase pipe IT cluster readiness retries
1 parent ddc6554 commit ef0d9f8

3 files changed

Lines changed: 118 additions & 6 deletions

File tree

.github/workflows/pipe-it.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ jobs:
8585
mvn clean verify \
8686
-P with-integration-tests \
8787
-DskipUTs \
88+
-DintegrationTest.clusterReadyRetryCount=90 \
8889
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
8990
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
9091
-pl integration-test \
@@ -188,6 +189,7 @@ jobs:
188189
mvn clean verify \
189190
-P with-integration-tests \
190191
-DskipUTs \
192+
-DintegrationTest.clusterReadyRetryCount=90 \
191193
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
192194
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
193195
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -295,6 +297,7 @@ jobs:
295297
mvn clean verify \
296298
-P with-integration-tests \
297299
-DskipUTs \
300+
-DintegrationTest.clusterReadyRetryCount=90 \
298301
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
299302
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
300303
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -402,6 +405,7 @@ jobs:
402405
mvn clean verify \
403406
-P with-integration-tests \
404407
-DskipUTs \
408+
-DintegrationTest.clusterReadyRetryCount=90 \
405409
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
406410
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
407411
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -491,6 +495,7 @@ jobs:
491495
mvn clean verify \
492496
-P with-integration-tests \
493497
-DskipUTs \
498+
-DintegrationTest.clusterReadyRetryCount=90 \
494499
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
495500
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
496501
-pl integration-test \
@@ -577,6 +582,7 @@ jobs:
577582
mvn clean verify \
578583
-P with-integration-tests \
579584
-DskipUTs \
585+
-DintegrationTest.clusterReadyRetryCount=90 \
580586
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
581587
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
582588
-pl integration-test \
@@ -663,6 +669,7 @@ jobs:
663669
mvn clean verify \
664670
-P with-integration-tests \
665671
-DskipUTs \
672+
-DintegrationTest.clusterReadyRetryCount=90 \
666673
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
667674
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
668675
-pl integration-test \
@@ -749,6 +756,7 @@ jobs:
749756
mvn clean verify \
750757
-P with-integration-tests \
751758
-DskipUTs \
759+
-DintegrationTest.clusterReadyRetryCount=90 \
752760
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
753761
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \
754762
-pl integration-test \
@@ -852,6 +860,7 @@ jobs:
852860
mvn clean verify \
853861
-P with-integration-tests \
854862
-DskipUTs \
863+
-DintegrationTest.clusterReadyRetryCount=90 \
855864
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
856865
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
857866
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -958,6 +967,7 @@ jobs:
958967
mvn clean verify \
959968
-P with-integration-tests \
960969
-DskipUTs \
970+
-DintegrationTest.clusterReadyRetryCount=90 \
961971
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
962972
-DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \
963973
-Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
@@ -1047,6 +1057,7 @@ jobs:
10471057
mvn clean verify \
10481058
-P with-integration-tests \
10491059
-DskipUTs \
1060+
-DintegrationTest.clusterReadyRetryCount=90 \
10501061
-DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \
10511062
-DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }},${{ matrix.cluster3 }} \
10521063
-pl integration-test \

integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ public static BaseEnv getEnv(final int index) throws IndexOutOfBoundsException {
5151
/** Create several environments according to the specific number. */
5252
public static void createEnv(final int num) {
5353
// Not judge EnvType for individual test convenience
54+
envList.clear();
5455
final long startTime = System.currentTimeMillis();
5556
for (int i = 0; i < num; ++i) {
5657
try {

integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java

Lines changed: 106 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@
9696

9797
public abstract class AbstractEnv implements BaseEnv {
9898
private static final Logger logger = IoTDBTestLogger.logger;
99+
private static final int DEFAULT_CLUSTER_READY_RETRY_COUNT = 30;
100+
private static final String CLUSTER_READY_RETRY_COUNT_PROPERTY =
101+
"integrationTest.clusterReadyRetryCount";
99102

100103
private final Random rand = new Random();
101104
protected List<ConfigNodeWrapper> configNodeWrapperList = Collections.emptyList();
@@ -104,7 +107,7 @@ public abstract class AbstractEnv implements BaseEnv {
104107
protected String testMethodName = null;
105108
protected int index = 0;
106109
protected long startTime;
107-
protected int retryCount = 30;
110+
protected int retryCount = getDefaultClusterReadyRetryCount();
108111
private IClientManager<TEndPoint, SyncConfigNodeIServiceClient> clientManager;
109112
private List<String> configNodeKillPoints = new ArrayList<>();
110113
private List<String> dataNodeKillPoints = new ArrayList<>();
@@ -128,6 +131,12 @@ protected AbstractEnv(final long startTime) {
128131
this.clusterConfig = new MppClusterConfig();
129132
}
130133

/**
 * Resolves the initial value of {@code retryCount} from the JVM system property named by
 * {@code CLUSTER_READY_RETRY_COUNT_PROPERTY} ("integrationTest.clusterReadyRetryCount").
 *
 * <p>{@link Integer#getInteger(String, int)} already yields
 * {@code DEFAULT_CLUSTER_READY_RETRY_COUNT} when the property is absent or is not a parsable
 * integer; the additional {@code > 0} guard makes zero and negative settings fall back to the
 * same default instead of being used as a retry count.
 *
 * @return the configured retry count when it is strictly positive, otherwise
 *     {@code DEFAULT_CLUSTER_READY_RETRY_COUNT}
 */
134+
private static int getDefaultClusterReadyRetryCount() {
135+
final int configuredRetryCount =
136+
Integer.getInteger(CLUSTER_READY_RETRY_COUNT_PROPERTY, DEFAULT_CLUSTER_READY_RETRY_COUNT);
137+
return configuredRetryCount > 0 ? configuredRetryCount : DEFAULT_CLUSTER_READY_RETRY_COUNT;
138+
}
139+
131140
@Override
132141
public ClusterConfig getConfig() {
133142
return clusterConfig;
@@ -401,12 +410,14 @@ public void checkClusterStatus(
401410
processStatusMap.clear();
402411

403412
showClusterResp = client.showCluster();
413+
showClusterStatus = showClusterResp.getStatus();
414+
actualNodeSize = showClusterResp.getNodeStatusSize();
415+
lastNodeStatus = showClusterResp.getNodeStatus();
404416

405417
// Check resp status
406418
if (showClusterResp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
407419
passed = false;
408420
showClusterPassed = false;
409-
showClusterStatus = showClusterResp.getStatus();
410421
}
411422

412423
// Check the number of nodes
@@ -416,15 +427,13 @@ public void checkClusterStatus(
416427
+ extraNodeWrappers.size()) {
417428
passed = false;
418429
nodeSizePassed = false;
419-
actualNodeSize = showClusterResp.getNodeStatusSize();
420430
}
421431

422432
// Check the status of nodes
423433
if (passed) {
424434
passed = nodeStatusCheck.test(showClusterResp.getNodeStatus());
425435
if (!passed) {
426436
nodeStatusPassed = false;
427-
lastNodeStatus = showClusterResp.getNodeStatus();
428437
}
429438
}
430439

@@ -510,8 +519,93 @@ public void checkClusterStatus(
510519
}
511520
}
512521

522+
dumpTestJVMSnapshotQuietly("cluster status check failed");
513523
throw new AssertionError(
514-
String.format("After %d times retry, the cluster can't work!", retryCount));
524+
buildClusterStatusFailureMessage(
525+
showClusterPassed,
526+
nodeSizePassed,
527+
nodeStatusPassed,
528+
processStatusPassed,
529+
showClusterStatus,
530+
actualNodeSize,
531+
lastNodeStatus,
532+
processStatusMap,
533+
lastException));
534+
}
535+
/**
 * Builds the single-line message used for the {@code AssertionError} raised when the cluster
 * status check gives up.
 *
 * <p>The message always starts with the current {@code retryCount}, followed by the four
 * pass/fail flags, {@code expectedNodeSize} and {@code actualNodeSize}. The remaining sections
 * are appended only when they carry information: {@code showClusterStatus} and
 * {@code lastNodeStatus} when non-null, {@code processStatus} when the map is non-empty, and
 * {@code lastException} (class name plus message) when non-null. The list returned by
 * {@code getClusterLogDirs()} is always appended last.
 *
 * <p>NOTE(review): {@code expectedNodeSize} is computed here from the config/data/AI node
 * wrapper lists only, while the node-count comparison in {@code checkClusterStatus} appears to
 * also add {@code extraNodeWrappers.size()}. The reported value may therefore differ from the
 * value that was actually compared - confirm against the caller.
 *
 * @param showClusterPassed flag reported as {@code showClusterPassed}
 * @param nodeSizePassed flag reported as {@code nodeSizePassed}
 * @param nodeStatusPassed flag reported as {@code nodeStatusPassed}
 * @param processStatusPassed flag reported as {@code processStatusPassed}
 * @param showClusterStatus status to report; omitted from the message when {@code null}
 * @param actualNodeSize value reported as {@code actualNodeSize}
 * @param lastNodeStatus node status map to report; omitted from the message when {@code null}
 * @param processStatusMap per-wrapper status map; must not be {@code null} because
 *     {@code isEmpty()} is called on it unconditionally; omitted from the message when empty
 * @param lastException exception to report; omitted from the message when {@code null}. Only
 *     its class name and {@code getMessage()} are included, not its stack trace
 * @return the assembled failure message
 */
536+
private String buildClusterStatusFailureMessage(
537+
final boolean showClusterPassed,
538+
final boolean nodeSizePassed,
539+
final boolean nodeStatusPassed,
540+
final boolean processStatusPassed,
541+
final TSStatus showClusterStatus,
542+
final int actualNodeSize,
543+
final Map<Integer, String> lastNodeStatus,
544+
final Map<AbstractNodeWrapper, Integer> processStatusMap,
545+
final Exception lastException) {
546+
final StringBuilder builder =
547+
new StringBuilder(
548+
String.format("After %d times retry, the cluster status check failed", retryCount));
549+
builder
550+
.append(": showClusterPassed=")
551+
.append(showClusterPassed)
552+
.append(", nodeSizePassed=")
553+
.append(nodeSizePassed)
554+
.append(", nodeStatusPassed=")
555+
.append(nodeStatusPassed)
556+
.append(", processStatusPassed=")
557+
.append(processStatusPassed)
558+
.append(", expectedNodeSize=")
559+
.append(
560+
configNodeWrapperList.size() + dataNodeWrapperList.size() + aiNodeWrapperList.size())
561+
.append(", actualNodeSize=")
562+
.append(actualNodeSize);
563+
if (showClusterStatus != null) {
564+
builder.append(", showClusterStatus=").append(showClusterStatus);
565+
}
566+
if (lastNodeStatus != null) {
567+
builder.append(", lastNodeStatus=").append(lastNodeStatus);
568+
}
569+
if (!processStatusMap.isEmpty()) {
570+
builder.append(", processStatus=").append(formatProcessStatus(processStatusMap));
571+
}
572+
if (lastException != null) {
573+
builder
574+
.append(", lastException=")
575+
.append(lastException.getClass().getName())
576+
.append(": ")
577+
.append(lastException.getMessage());
578+
}
579+
builder.append(", logDirs=").append(getClusterLogDirs());
580+
return builder.toString();
581+
}
582+
/**
 * Re-keys a per-wrapper status map by {@code AbstractNodeWrapper#getId()} so that it can be
 * embedded in a failure message.
 *
 * <p>The result is a {@link LinkedHashMap}, so entries appear in the iteration order of
 * {@code processStatusMap}. If two wrappers return the same id, the later entry replaces the
 * earlier one (plain {@code put} semantics).
 *
 * @param processStatusMap map from node wrapper to the integer status recorded for it
 * @return a new map from wrapper id to that same integer status
 */
583+
private Map<String, Integer> formatProcessStatus(
584+
final Map<AbstractNodeWrapper, Integer> processStatusMap) {
585+
final Map<String, Integer> result = new LinkedHashMap<>();
586+
processStatusMap.forEach(
587+
(nodeWrapper, statusCode) -> result.put(nodeWrapper.getId(), statusCode));
588+
return result;
589+
}
590+
/**
 * Collects the log directory paths of the nodes tracked by this environment, for inclusion in
 * failure messages.
 *
 * <p>Wrappers are gathered in the order config nodes, data nodes, AI nodes; each is mapped
 * through {@code AbstractNodeWrapper#getLogDirPath()} and duplicate paths are dropped with
 * {@code distinct()}, keeping the first occurrence.
 *
 * <p>NOTE(review): only the three wrapper lists held by this class are consulted; any extra
 * node wrappers passed to a status check are not represented here - confirm whether that is
 * intended.
 *
 * @return the de-duplicated log directory paths, in first-seen order
 */
591+
private List<String> getClusterLogDirs() {
592+
final List<AbstractNodeWrapper> allNodeWrappers = new ArrayList<>();
593+
allNodeWrappers.addAll(configNodeWrapperList);
594+
allNodeWrappers.addAll(dataNodeWrapperList);
595+
allNodeWrappers.addAll(aiNodeWrapperList);
596+
return allNodeWrappers.stream()
597+
.map(AbstractNodeWrapper::getLogDirPath)
598+
.distinct()
599+
.collect(Collectors.toList());
600+
}
601+
/**
 * Best-effort wrapper around {@code dumpTestJVMSnapshot()} for use on failure paths.
 *
 * <p>Logs the reason at INFO level, triggers the dump, and if the dump throws an
 * {@link Exception} logs it at WARN level instead of propagating it. The visible callers invoke
 * this immediately before throwing their own {@code AssertionError}, so swallowing the
 * exception here keeps a failed dump from replacing that more informative error. Only
 * {@code Exception} is caught; an {@code Error} raised by the dump still propagates.
 *
 * @param reason short description of why the dump was requested; interpolated into both log
 *     messages
 */
602+
private void dumpTestJVMSnapshotQuietly(final String reason) {
603+
try {
604+
logger.info("Dumping test JVM snapshots because {}.", reason);
605+
dumpTestJVMSnapshot();
606+
} catch (final Exception e) {
607+
logger.warn("Failed to dump test JVM snapshots after {}", reason, e);
608+
}
515609
}
516610

517611
private void handleProcessStatus(Map<AbstractNodeWrapper, Integer> processStatusMap) {
@@ -956,6 +1050,7 @@ protected void testJDBCConnection() {
9561050
.collect(Collectors.toList());
9571051
final RequestDelegate<Void> testDelegate =
9581052
new ParallelRequestDelegate<>(endpoints, NODE_START_TIMEOUT, this);
1053+
final Map<String, String> lastConnectionFailures = Collections.synchronizedMap(new HashMap<>());
9591054
for (final DataNodeWrapper dataNode : dataNodeWrapperList) {
9601055
final String dataNodeEndpoint = dataNode.getIpAndPortString();
9611056
testDelegate.addRequest(
@@ -974,6 +1069,8 @@ protected void testJDBCConnection() {
9741069
return null;
9751070
} catch (final Exception e) {
9761071
lastException = e;
1072+
lastConnectionFailures.put(
1073+
dataNodeEndpoint, e.getClass().getName() + ": " + e.getMessage());
9771074
TimeUnit.SECONDS.sleep(1L);
9781075
}
9791076
}
@@ -987,8 +1084,11 @@ protected void testJDBCConnection() {
9871084
testDelegate.requestAll();
9881085
} catch (final Exception e) {
9891086
logger.error("exception in test Cluster with RPC, message: {}", e.getMessage(), e);
1087+
dumpTestJVMSnapshotQuietly("JDBC connection check failed");
9901088
throw new AssertionError(
991-
String.format("After %d times retry, the cluster can't work!", retryCount));
1089+
String.format(
1090+
"After %d times retry, JDBC connections to DataNodes are not ready. endpoints=%s, lastConnectionFailures=%s, logDirs=%s",
1091+
retryCount, endpoints, lastConnectionFailures, getClusterLogDirs()));
9921092
}
9931093
}
9941094

0 commit comments

Comments
 (0)