9696
9797public abstract class AbstractEnv implements BaseEnv {
9898 private static final Logger logger = IoTDBTestLogger .logger ;
99+ private static final int DEFAULT_CLUSTER_READY_RETRY_COUNT = 30 ;
100+ private static final String CLUSTER_READY_RETRY_COUNT_PROPERTY =
101+ "integrationTest.clusterReadyRetryCount" ;
99102
100103 private final Random rand = new Random ();
101104 protected List <ConfigNodeWrapper > configNodeWrapperList = Collections .emptyList ();
@@ -104,7 +107,7 @@ public abstract class AbstractEnv implements BaseEnv {
104107 protected String testMethodName = null ;
105108 protected int index = 0 ;
106109 protected long startTime ;
107- protected int retryCount = 30 ;
110+ protected int retryCount = getDefaultClusterReadyRetryCount () ;
108111 private IClientManager <TEndPoint , SyncConfigNodeIServiceClient > clientManager ;
109112 private List <String > configNodeKillPoints = new ArrayList <>();
110113 private List <String > dataNodeKillPoints = new ArrayList <>();
@@ -128,6 +131,12 @@ protected AbstractEnv(final long startTime) {
128131 this .clusterConfig = new MppClusterConfig ();
129132 }
130133
134+ private static int getDefaultClusterReadyRetryCount () {
135+ final int configuredRetryCount =
136+ Integer .getInteger (CLUSTER_READY_RETRY_COUNT_PROPERTY , DEFAULT_CLUSTER_READY_RETRY_COUNT );
137+ return configuredRetryCount > 0 ? configuredRetryCount : DEFAULT_CLUSTER_READY_RETRY_COUNT ;
138+ }
139+
131140 @ Override
132141 public ClusterConfig getConfig () {
133142 return clusterConfig ;
@@ -401,12 +410,14 @@ public void checkClusterStatus(
401410 processStatusMap .clear ();
402411
403412 showClusterResp = client .showCluster ();
413+ showClusterStatus = showClusterResp .getStatus ();
414+ actualNodeSize = showClusterResp .getNodeStatusSize ();
415+ lastNodeStatus = showClusterResp .getNodeStatus ();
404416
405417 // Check resp status
406418 if (showClusterResp .getStatus ().getCode () != TSStatusCode .SUCCESS_STATUS .getStatusCode ()) {
407419 passed = false ;
408420 showClusterPassed = false ;
409- showClusterStatus = showClusterResp .getStatus ();
410421 }
411422
412423 // Check the number of nodes
@@ -416,15 +427,13 @@ public void checkClusterStatus(
416427 + extraNodeWrappers .size ()) {
417428 passed = false ;
418429 nodeSizePassed = false ;
419- actualNodeSize = showClusterResp .getNodeStatusSize ();
420430 }
421431
422432 // Check the status of nodes
423433 if (passed ) {
424434 passed = nodeStatusCheck .test (showClusterResp .getNodeStatus ());
425435 if (!passed ) {
426436 nodeStatusPassed = false ;
427- lastNodeStatus = showClusterResp .getNodeStatus ();
428437 }
429438 }
430439
@@ -510,8 +519,93 @@ public void checkClusterStatus(
510519 }
511520 }
512521
522+ dumpTestJVMSnapshotQuietly ("cluster status check failed" );
513523 throw new AssertionError (
514- String .format ("After %d times retry, the cluster can't work!" , retryCount ));
524+ buildClusterStatusFailureMessage (
525+ showClusterPassed ,
526+ nodeSizePassed ,
527+ nodeStatusPassed ,
528+ processStatusPassed ,
529+ showClusterStatus ,
530+ actualNodeSize ,
531+ lastNodeStatus ,
532+ processStatusMap ,
533+ lastException ));
534+ }
535+
536+ private String buildClusterStatusFailureMessage (
537+ final boolean showClusterPassed ,
538+ final boolean nodeSizePassed ,
539+ final boolean nodeStatusPassed ,
540+ final boolean processStatusPassed ,
541+ final TSStatus showClusterStatus ,
542+ final int actualNodeSize ,
543+ final Map <Integer , String > lastNodeStatus ,
544+ final Map <AbstractNodeWrapper , Integer > processStatusMap ,
545+ final Exception lastException ) {
546+ final StringBuilder builder =
547+ new StringBuilder (
548+ String .format ("After %d times retry, the cluster status check failed" , retryCount ));
549+ builder
550+ .append (": showClusterPassed=" )
551+ .append (showClusterPassed )
552+ .append (", nodeSizePassed=" )
553+ .append (nodeSizePassed )
554+ .append (", nodeStatusPassed=" )
555+ .append (nodeStatusPassed )
556+ .append (", processStatusPassed=" )
557+ .append (processStatusPassed )
558+ .append (", expectedNodeSize=" )
559+ .append (
560+ configNodeWrapperList .size () + dataNodeWrapperList .size () + aiNodeWrapperList .size ())
561+ .append (", actualNodeSize=" )
562+ .append (actualNodeSize );
563+ if (showClusterStatus != null ) {
564+ builder .append (", showClusterStatus=" ).append (showClusterStatus );
565+ }
566+ if (lastNodeStatus != null ) {
567+ builder .append (", lastNodeStatus=" ).append (lastNodeStatus );
568+ }
569+ if (!processStatusMap .isEmpty ()) {
570+ builder .append (", processStatus=" ).append (formatProcessStatus (processStatusMap ));
571+ }
572+ if (lastException != null ) {
573+ builder
574+ .append (", lastException=" )
575+ .append (lastException .getClass ().getName ())
576+ .append (": " )
577+ .append (lastException .getMessage ());
578+ }
579+ builder .append (", logDirs=" ).append (getClusterLogDirs ());
580+ return builder .toString ();
581+ }
582+
583+ private Map <String , Integer > formatProcessStatus (
584+ final Map <AbstractNodeWrapper , Integer > processStatusMap ) {
585+ final Map <String , Integer > result = new LinkedHashMap <>();
586+ processStatusMap .forEach (
587+ (nodeWrapper , statusCode ) -> result .put (nodeWrapper .getId (), statusCode ));
588+ return result ;
589+ }
590+
591+ private List <String > getClusterLogDirs () {
592+ final List <AbstractNodeWrapper > allNodeWrappers = new ArrayList <>();
593+ allNodeWrappers .addAll (configNodeWrapperList );
594+ allNodeWrappers .addAll (dataNodeWrapperList );
595+ allNodeWrappers .addAll (aiNodeWrapperList );
596+ return allNodeWrappers .stream ()
597+ .map (AbstractNodeWrapper ::getLogDirPath )
598+ .distinct ()
599+ .collect (Collectors .toList ());
600+ }
601+
602+ private void dumpTestJVMSnapshotQuietly (final String reason ) {
603+ try {
604+ logger .info ("Dumping test JVM snapshots because {}." , reason );
605+ dumpTestJVMSnapshot ();
606+ } catch (final Exception e ) {
607+ logger .warn ("Failed to dump test JVM snapshots after {}" , reason , e );
608+ }
515609 }
516610
517611 private void handleProcessStatus (Map <AbstractNodeWrapper , Integer > processStatusMap ) {
@@ -956,6 +1050,7 @@ protected void testJDBCConnection() {
9561050 .collect (Collectors .toList ());
9571051 final RequestDelegate <Void > testDelegate =
9581052 new ParallelRequestDelegate <>(endpoints , NODE_START_TIMEOUT , this );
1053+ final Map <String , String > lastConnectionFailures = Collections .synchronizedMap (new HashMap <>());
9591054 for (final DataNodeWrapper dataNode : dataNodeWrapperList ) {
9601055 final String dataNodeEndpoint = dataNode .getIpAndPortString ();
9611056 testDelegate .addRequest (
@@ -974,6 +1069,8 @@ protected void testJDBCConnection() {
9741069 return null ;
9751070 } catch (final Exception e ) {
9761071 lastException = e ;
1072+ lastConnectionFailures .put (
1073+ dataNodeEndpoint , e .getClass ().getName () + ": " + e .getMessage ());
9771074 TimeUnit .SECONDS .sleep (1L );
9781075 }
9791076 }
@@ -987,8 +1084,11 @@ protected void testJDBCConnection() {
9871084 testDelegate .requestAll ();
9881085 } catch (final Exception e ) {
9891086 logger .error ("exception in test Cluster with RPC, message: {}" , e .getMessage (), e );
1087+ dumpTestJVMSnapshotQuietly ("JDBC connection check failed" );
9901088 throw new AssertionError (
991- String .format ("After %d times retry, the cluster can't work!" , retryCount ));
1089+ String .format (
1090+ "After %d times retry, JDBC connections to DataNodes are not ready. endpoints=%s, lastConnectionFailures=%s, logDirs=%s" ,
1091+ retryCount , endpoints , lastConnectionFailures , getClusterLogDirs ()));
9921092 }
9931093 }
9941094
0 commit comments