diff --git a/.github/workflows/pipe-it.yml b/.github/workflows/pipe-it.yml index 0a4e5e54edabf..2a9fbbd12398b 100644 --- a/.github/workflows/pipe-it.yml +++ b/.github/workflows/pipe-it.yml @@ -85,6 +85,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -pl integration-test \ @@ -188,6 +189,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \ -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \ @@ -295,6 +297,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \ @@ -402,6 +405,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \ @@ -491,6 +495,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -pl integration-test \ @@ -577,6 +582,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -pl integration-test \ @@ -663,6 +669,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -pl integration-test \ @@ -749,6 +756,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }} \ -pl integration-test \ @@ -852,6 +860,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \ -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \ @@ -958,6 +967,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster }},${{ matrix.cluster }} \ -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \ @@ -1047,6 +1057,7 @@ jobs: mvn clean verify \ -P with-integration-tests \ -DskipUTs \ + -DintegrationTest.clusterReadyRetryCount=90 \ -DintegrationTest.forkCount=1 -DConfigNodeMaxHeapSize=256 -DDataNodeMaxHeapSize=1024 -DDataNodeMaxDirectMemorySize=768 \ -DClusterConfigurations=${{ matrix.cluster1 }},${{ matrix.cluster2 }},${{ matrix.cluster3 }} \ -pl integration-test \ diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java b/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java index 5832f1c485bc5..f2e5f9c1f903d 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/MultiEnvFactory.java @@ -51,6 +51,7 @@ public static BaseEnv getEnv(final int index) throws IndexOutOfBoundsException { /** Create several environments according to the specific number. */ public static void createEnv(final int num) { // Not judge EnvType for individual test convenience + envList.clear(); final long startTime = System.currentTimeMillis(); for (int i = 0; i < num; ++i) { try { diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java index 0e7eefb3a4188..b8913b1b69192 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/env/AbstractEnv.java @@ -97,6 +97,9 @@ public abstract class AbstractEnv implements BaseEnv { private static final Logger logger = IoTDBTestLogger.logger; + private static final int DEFAULT_CLUSTER_READY_RETRY_COUNT = 30; + private static final String CLUSTER_READY_RETRY_COUNT_PROPERTY = + "integrationTest.clusterReadyRetryCount"; private final Random rand = new Random(); protected List configNodeWrapperList = Collections.emptyList(); @@ -105,7 +108,7 @@ public abstract class AbstractEnv implements BaseEnv { protected String testMethodName = null; protected int index = 0; protected long startTime; - protected int retryCount = 30; + protected int retryCount = getDefaultClusterReadyRetryCount(); private IClientManager clientManager; private List configNodeKillPoints = new ArrayList<>(); private List dataNodeKillPoints = new ArrayList<>(); @@ -128,6 +131,12 @@ protected AbstractEnv(final long startTime) { this.clusterConfig = new MppClusterConfig(); } + private static int getDefaultClusterReadyRetryCount() { + final int configuredRetryCount = + Integer.getInteger(CLUSTER_READY_RETRY_COUNT_PROPERTY, DEFAULT_CLUSTER_READY_RETRY_COUNT); + return configuredRetryCount > 0 ? configuredRetryCount : DEFAULT_CLUSTER_READY_RETRY_COUNT; + } + @Override public ClusterConfig getConfig() { return clusterConfig; @@ -421,12 +430,14 @@ public void checkClusterStatus( processStatusMap.clear(); showClusterResp = client.showCluster(); + showClusterStatus = showClusterResp.getStatus(); + actualNodeSize = showClusterResp.getNodeStatusSize(); + lastNodeStatus = showClusterResp.getNodeStatus(); // Check resp status if (showClusterResp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { passed = false; showClusterPassed = false; - showClusterStatus = showClusterResp.getStatus(); } // Check the number of nodes @@ -436,7 +447,6 @@ public void checkClusterStatus( + aiNodeWrapperList.size()) { passed = false; nodeSizePassed = false; - actualNodeSize = showClusterResp.getNodeStatusSize(); } // Check the status of nodes @@ -444,7 +454,6 @@ public void checkClusterStatus( passed = nodeStatusCheck.test(showClusterResp.getNodeStatus()); if (!passed) { nodeStatusPassed = false; - lastNodeStatus = showClusterResp.getNodeStatus(); } } @@ -530,8 +539,93 @@ public void checkClusterStatus( } } + dumpTestJVMSnapshotQuietly("cluster status check failed"); throw new AssertionError( - String.format("After %d times retry, the cluster can't work!", retryCount)); + buildClusterStatusFailureMessage( + showClusterPassed, + nodeSizePassed, + nodeStatusPassed, + processStatusPassed, + showClusterStatus, + actualNodeSize, + lastNodeStatus, + processStatusMap, + lastException)); + } + + private String buildClusterStatusFailureMessage( + final boolean showClusterPassed, + final boolean nodeSizePassed, + final boolean nodeStatusPassed, + final boolean processStatusPassed, + final TSStatus showClusterStatus, + final int actualNodeSize, + final Map lastNodeStatus, + final Map processStatusMap, + final Exception lastException) { + final StringBuilder builder = + new StringBuilder( + String.format("After %d times retry, the cluster status check failed", retryCount)); + builder + .append(": showClusterPassed=") + .append(showClusterPassed) + .append(", nodeSizePassed=") + .append(nodeSizePassed) + .append(", nodeStatusPassed=") + .append(nodeStatusPassed) + .append(", processStatusPassed=") + .append(processStatusPassed) + .append(", expectedNodeSize=") + .append( + configNodeWrapperList.size() + dataNodeWrapperList.size() + aiNodeWrapperList.size()) + .append(", actualNodeSize=") + .append(actualNodeSize); + if (showClusterStatus != null) { + builder.append(", showClusterStatus=").append(showClusterStatus); + } + if (lastNodeStatus != null) { + builder.append(", lastNodeStatus=").append(lastNodeStatus); + } + if (!processStatusMap.isEmpty()) { + builder.append(", processStatus=").append(formatProcessStatus(processStatusMap)); + } + if (lastException != null) { + builder + .append(", lastException=") + .append(lastException.getClass().getName()) + .append(": ") + .append(lastException.getMessage()); + } + builder.append(", logDirs=").append(getClusterLogDirs()); + return builder.toString(); + } + + private Map formatProcessStatus( + final Map processStatusMap) { + final Map result = new LinkedHashMap<>(); + processStatusMap.forEach( + (nodeWrapper, statusCode) -> result.put(nodeWrapper.getId(), statusCode)); + return result; + } + + private List getClusterLogDirs() { + final List allNodeWrappers = new ArrayList<>(); + allNodeWrappers.addAll(configNodeWrapperList); + allNodeWrappers.addAll(dataNodeWrapperList); + allNodeWrappers.addAll(aiNodeWrapperList); + return allNodeWrappers.stream() + .map(AbstractNodeWrapper::getLogDirPath) + .distinct() + .collect(Collectors.toList()); + } + + private void dumpTestJVMSnapshotQuietly(final String reason) { + try { + logger.info("Dumping test JVM snapshots because {}.", reason); + dumpTestJVMSnapshot(); + } catch (final Exception e) { + logger.warn("Failed to dump test JVM snapshots after {}", reason, e); + } } private void handleProcessStatus(Map processStatusMap) { @@ -976,6 +1070,7 @@ protected void testJDBCConnection() { .collect(Collectors.toList()); final RequestDelegate testDelegate = new ParallelRequestDelegate<>(endpoints, NODE_START_TIMEOUT, this); + final Map lastConnectionFailures = Collections.synchronizedMap(new HashMap<>()); for (final DataNodeWrapper dataNode : dataNodeWrapperList) { final String dataNodeEndpoint = dataNode.getIpAndPortString(); testDelegate.addRequest( @@ -994,6 +1089,8 @@ protected void testJDBCConnection() { return null; } catch (final Exception e) { lastException = e; + lastConnectionFailures.put( + dataNodeEndpoint, e.getClass().getName() + ": " + e.getMessage()); TimeUnit.SECONDS.sleep(1L); } } @@ -1007,8 +1104,11 @@ protected void testJDBCConnection() { testDelegate.requestAll(); } catch (final Exception e) { logger.error("exception in test Cluster with RPC, message: {}", e.getMessage(), e); + dumpTestJVMSnapshotQuietly("JDBC connection check failed"); throw new AssertionError( - String.format("After %d times retry, the cluster can't work!", retryCount)); + String.format( + "After %d times retry, JDBC connections to DataNodes are not ready. endpoints=%s, lastConnectionFailures=%s, logDirs=%s", + retryCount, endpoints, lastConnectionFailures, getClusterLogDirs())); } }