diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java index fe687d17556f7..ce8f77f65ed33 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/statemachine/ConfigRegionStateMachine.java @@ -29,6 +29,7 @@ import org.apache.iotdb.commons.consensus.ConsensusGroupId; import org.apache.iotdb.commons.file.SystemFileFactory; import org.apache.iotdb.commons.request.IConsensusRequest; +import org.apache.iotdb.commons.utils.LogThrottler; import org.apache.iotdb.confignode.conf.ConfigNodeConfig; import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor; import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; @@ -81,6 +82,8 @@ public class ConfigRegionStateMachine implements IStateMachine, IStateMachine.Ev /** Variables for {@link ConfigNode} Simple Consensus. */ private LogWriter simpleLogWriter; + private final LogThrottler simpleConsensusWalFlushFailureLogThrottler = new LogThrottler(); + private File simpleLogFile; private int startIndex; private int endIndex; @@ -470,9 +473,14 @@ private void flushWALForSimpleConsensus() { if (simpleLogWriter != null) { try { simpleLogWriter.force(); + simpleConsensusWalFlushFailureLogThrottler.reset(); } catch (IOException e) { - LOGGER.error( - ConfigNodeMessages.CAN_T_FORCE_LOGWRITER_FOR_CONFIGNODE_FLUSHWALFORSIMPLECONSENSUS, e); + if (simpleConsensusWalFlushFailureLogThrottler.shouldLog( + LogThrottler.getFailureSignature(e))) { + LOGGER.error( + ConfigNodeMessages.CAN_T_FORCE_LOGWRITER_FOR_CONFIGNODE_FLUSHWALFORSIMPLECONSENSUS, + e); + } } } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/PipeMetaSyncer.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/PipeMetaSyncer.java index 941d6d6d06cec..cc95e6e451bee 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/PipeMetaSyncer.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/PipeMetaSyncer.java @@ -24,6 +24,7 @@ import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.commons.pipe.config.PipeConfig; +import org.apache.iotdb.commons.utils.LogThrottler; import org.apache.iotdb.confignode.i18n.ManagerMessages; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.ProcedureManager; @@ -52,6 +53,7 @@ public class PipeMetaSyncer { PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes(); private final ConfigManager configManager; + private final LogThrottler syncFailureLogThrottler = new LogThrottler(); private Future metaSyncFuture; @@ -95,10 +97,13 @@ private synchronized void sync() { isLastPipeSyncSuccessful = false; if (configManager.getPipeManager().getPipeTaskCoordinator().isLocked()) { - LOGGER.warn( - ManagerMessages.PIPETASKCOORDINATORLOCK_IS_HELD_BY_ANOTHER_THREAD_SKIP_THIS_ROUND_OF_2); + if (syncFailureLogThrottler.shouldLog("coordinator-lock-held", "locked")) { + LOGGER.warn( + ManagerMessages.PIPETASKCOORDINATORLOCK_IS_HELD_BY_ANOTHER_THREAD_SKIP_THIS_ROUND_OF_2); + } return; } + syncFailureLogThrottler.reset("coordinator-lock-held"); final ProcedureManager procedureManager = configManager.getProcedureManager(); @@ -129,27 +134,38 @@ private synchronized void sync() { procedureManager.pipeHandleMetaChangeWithBlock(true, false); if (handleMetaChangeStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { successfulSync = true; + syncFailureLogThrottler.reset("handle-pipe-meta-change"); } else { - LOGGER.warn( - ManagerMessages.FAILED_TO_HANDLE_PIPE_META_CHANGE_RESULT_STATUS, - handleMetaChangeStatus); + if (syncFailureLogThrottler.shouldLog( + "handle-pipe-meta-change", getStatusFailureSignature(handleMetaChangeStatus))) { + LOGGER.warn( + ManagerMessages.FAILED_TO_HANDLE_PIPE_META_CHANGE_RESULT_STATUS, + handleMetaChangeStatus); + } } } if (successfulSync) { + syncFailureLogThrottler.reset(); LOGGER.info( ManagerMessages.AFTER_THIS_SUCCESSFUL_SYNC_IF_PIPETASKINFO_IS_EMPTY_DURING_THIS); isLastPipeSyncSuccessful = true; } } else { - LOGGER.warn(ManagerMessages.FAILED_TO_SYNC_PIPE_META_RESULT_STATUS, metaSyncStatus); + if (syncFailureLogThrottler.shouldLog( + "pipe-meta-sync", getStatusFailureSignature(metaSyncStatus))) { + LOGGER.warn(ManagerMessages.FAILED_TO_SYNC_PIPE_META_RESULT_STATUS, metaSyncStatus); + } + return; } + syncFailureLogThrottler.reset("pipe-meta-sync"); } public synchronized void stop() { if (metaSyncFuture != null) { metaSyncFuture.cancel(false); metaSyncFuture = null; + syncFailureLogThrottler.reset(); LOGGER.info(ManagerMessages.PIPEMETASYNCER_IS_STOPPED_SUCCESSFULLY); } } @@ -158,9 +174,12 @@ private boolean autoRestartWithLock() { final AtomicReference pipeTaskInfo = configManager.getPipeManager().getPipeTaskCoordinator().tryLock(); if (pipeTaskInfo == null) { - LOGGER.warn(ManagerMessages.FAILED_TO_ACQUIRE_PIPE_LOCK_FOR_AUTO_RESTART_PIPE_TASK); + if (syncFailureLogThrottler.shouldLog("auto-restart-lock", "locked")) { + LOGGER.warn(ManagerMessages.FAILED_TO_ACQUIRE_PIPE_LOCK_FOR_AUTO_RESTART_PIPE_TASK); + } return false; } + syncFailureLogThrottler.reset("auto-restart-lock"); try { return pipeTaskInfo.get().autoRestart(); } finally { @@ -175,8 +194,12 @@ private void checkAndRepairConsensusPipes() { .getEnv() .getRegionMaintainHandler() .checkAndRepairConsensusPipes(); + syncFailureLogThrottler.reset("check-and-repair-consensus-pipes"); } catch (Exception e) { - LOGGER.warn(ManagerMessages.FAILED_TO_CHECK_AND_REPAIR_CONSENSUS_PIPES, e); + if (syncFailureLogThrottler.shouldLog( + "check-and-repair-consensus-pipes", LogThrottler.getFailureSignature(e))) { + LOGGER.warn(ManagerMessages.FAILED_TO_CHECK_AND_REPAIR_CONSENSUS_PIPES, e); + } } } @@ -184,9 +207,12 @@ private boolean handleSuccessfulRestartWithLock() { final AtomicReference pipeTaskInfo = configManager.getPipeManager().getPipeTaskCoordinator().tryLock(); if (pipeTaskInfo == null) { - LOGGER.warn(ManagerMessages.FAILED_TO_ACQUIRE_PIPE_LOCK_FOR_HANDLING_SUCCESSFUL_RESTART); + if (syncFailureLogThrottler.shouldLog("handle-successful-restart-lock", "locked")) { + LOGGER.warn(ManagerMessages.FAILED_TO_ACQUIRE_PIPE_LOCK_FOR_HANDLING_SUCCESSFUL_RESTART); + } return false; } + syncFailureLogThrottler.reset("handle-successful-restart-lock"); try { pipeTaskInfo.get().handleSuccessfulRestart(); return true; @@ -194,4 +220,8 @@ private boolean handleSuccessfulRestartWithLock() { configManager.getPipeManager().getPipeTaskCoordinator().unlock(); } } + + private static String getStatusFailureSignature(TSStatus status) { + return status.getCode() + ":" + status.getMessage(); + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/heartbeat/PipeHeartbeatScheduler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/heartbeat/PipeHeartbeatScheduler.java index bbdaee8e12c3e..53e1edd731556 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/heartbeat/PipeHeartbeatScheduler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/pipe/coordinator/runtime/heartbeat/PipeHeartbeatScheduler.java @@ -25,6 +25,7 @@ import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.commons.pipe.config.PipeConfig; +import org.apache.iotdb.commons.utils.LogThrottler; import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.client.async.CnToDnInternalServiceAsyncRequestManager; import org.apache.iotdb.confignode.client.async.handlers.DataNodeAsyncRequestContext; @@ -58,6 +59,7 @@ public class PipeHeartbeatScheduler { private final ConfigManager configManager; private final PipeHeartbeatParser pipeHeartbeatParser; + private final LogThrottler heartbeatFailureLogThrottler = new LogThrottler(); private Future heartbeatFuture; @@ -81,14 +83,18 @@ public synchronized void start() { private synchronized void heartbeat() { if (!configManager.getPipeManager().getPipeTaskCoordinator().hasAnyPipe()) { + heartbeatFailureLogThrottler.reset(); return; } if (configManager.getPipeManager().getPipeTaskCoordinator().isLocked()) { - LOGGER.warn( - ManagerMessages.PIPETASKCOORDINATORLOCK_IS_HELD_BY_ANOTHER_THREAD_SKIP_THIS_ROUND_OF); + if (heartbeatFailureLogThrottler.shouldLog("coordinator-lock-held", "locked")) { + LOGGER.warn( + ManagerMessages.PIPETASKCOORDINATORLOCK_IS_HELD_BY_ANOTHER_THREAD_SKIP_THIS_ROUND_OF); + } return; } + heartbeatFailureLogThrottler.reset("coordinator-lock-held"); // Data node heartbeat final Map dataNodeLocationMap = @@ -129,8 +135,12 @@ private synchronized void heartbeat() { null, configNodeResp.getPipeRemainingEventCountList(), configNodeResp.getPipeRemainingTimeList())); + heartbeatFailureLogThrottler.reset("config-node-pipe-meta-collect"); } catch (final Exception e) { - LOGGER.warn(ManagerMessages.FAILED_TO_COLLECT_PIPE_META_LIST_FROM_CONFIG_NODE_TASK, e); + if (heartbeatFailureLogThrottler.shouldLog( + "config-node-pipe-meta-collect", LogThrottler.getFailureSignature(e))) { + LOGGER.warn(ManagerMessages.FAILED_TO_COLLECT_PIPE_META_LIST_FROM_CONFIG_NODE_TASK, e); + } } } @@ -138,6 +148,7 @@ public synchronized void stop() { if (IS_SEPERATED_PIPE_HEARTBEAT_ENABLED && heartbeatFuture != null) { heartbeatFuture.cancel(false); heartbeatFuture = null; + heartbeatFailureLogThrottler.reset(); LOGGER.info(ManagerMessages.PIPEHEARTBEAT_IS_STOPPED_SUCCESSFULLY); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java index 307554231c736..f93a3c3e83fec 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java @@ -24,6 +24,7 @@ import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.commons.utils.LogThrottler; import org.apache.iotdb.confignode.i18n.ManagerMessages; import org.apache.iotdb.confignode.manager.ConfigManager; import org.apache.iotdb.confignode.manager.ProcedureManager; @@ -49,6 +50,7 @@ public class SubscriptionMetaSyncer { SubscriptionConfig.getInstance().getSubscriptionMetaSyncerSyncIntervalMinutes(); private final ConfigManager configManager; + private final LogThrottler syncFailureLogThrottler = new LogThrottler(); private Future metaSyncFuture; @@ -84,10 +86,14 @@ private synchronized void sync() { isLastSubscriptionSyncSuccessful = false; if (configManager.getSubscriptionManager().getSubscriptionCoordinator().isLocked()) { - LOGGER.warn( - ManagerMessages.SUBSCRIPTIONCOORDINATORLOCK_IS_HELD_BY_ANOTHER_THREAD_SKIP_THIS_ROUND_OF); + if (syncFailureLogThrottler.shouldLog("coordinator-lock-held", "locked")) { + LOGGER.warn( + ManagerMessages + .SUBSCRIPTIONCOORDINATORLOCK_IS_HELD_BY_ANOTHER_THREAD_SKIP_THIS_ROUND_OF); + } return; } + syncFailureLogThrottler.reset("coordinator-lock-held"); final ProcedureManager procedureManager = configManager.getProcedureManager(); @@ -95,26 +101,38 @@ private synchronized void sync() { // TODO: consider drop the topic which is subscribed by consumers final TSStatus topicMetaSyncStatus = procedureManager.topicMetaSync(); if (topicMetaSyncStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn(ManagerMessages.FAILED_TO_SYNC_TOPIC_META_RESULT_STATUS, topicMetaSyncStatus); + if (syncFailureLogThrottler.shouldLog( + "topic-meta-sync", getStatusFailureSignature(topicMetaSyncStatus))) { + LOGGER.warn(ManagerMessages.FAILED_TO_SYNC_TOPIC_META_RESULT_STATUS, topicMetaSyncStatus); + } return; } + syncFailureLogThrottler.reset("topic-meta-sync"); // sync consumer meta if syncing topic meta successfully final TSStatus consumerGroupMetaSyncStatus = procedureManager.consumerGroupMetaSync(); if (consumerGroupMetaSyncStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn( - ManagerMessages.FAILED_TO_SYNC_CONSUMER_GROUP_META_RESULT_STATUS, - consumerGroupMetaSyncStatus); + if (syncFailureLogThrottler.shouldLog( + "consumer-group-meta-sync", getStatusFailureSignature(consumerGroupMetaSyncStatus))) { + LOGGER.warn( + ManagerMessages.FAILED_TO_SYNC_CONSUMER_GROUP_META_RESULT_STATUS, + consumerGroupMetaSyncStatus); + } return; } + syncFailureLogThrottler.reset("consumer-group-meta-sync"); // sync commit progress if syncing consumer group meta successfully final TSStatus commitProgressSyncStatus = procedureManager.commitProgressSync(); if (commitProgressSyncStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn("Failed to sync commit progress. Result status: {}.", commitProgressSyncStatus); + if (syncFailureLogThrottler.shouldLog( + "commit-progress-sync", getStatusFailureSignature(commitProgressSyncStatus))) { + LOGGER.warn("Failed to sync commit progress. Result status: {}.", commitProgressSyncStatus); + } return; } + syncFailureLogThrottler.reset(); LOGGER.info( ManagerMessages.AFTER_THIS_SUCCESSFUL_SYNC_IF_SUBSCRIPTIONINFO_IS_EMPTY_DURING_THIS); isLastSubscriptionSyncSuccessful = true; @@ -124,7 +142,12 @@ public synchronized void stop() { if (metaSyncFuture != null) { metaSyncFuture.cancel(false); metaSyncFuture = null; + syncFailureLogThrottler.reset(); LOGGER.info(ManagerMessages.SUBSCRIPTIONMETASYNCER_IS_STOPPED_SUCCESSFULLY); } } + + private static String getStatusFailureSignature(TSStatus status) { + return status.getCode() + ":" + status.getMessage(); + } } diff --git a/iotdb-core/datanode/pom.xml b/iotdb-core/datanode/pom.xml index 97a296c9a9e53..6ead8e269d268 100644 --- a/iotdb-core/datanode/pom.xml +++ b/iotdb-core/datanode/pom.xml @@ -481,6 +481,14 @@ org.apache.iotdb:isession at.yawk.lz4:lz4-java + + ch.qos.logback:logback-classic + ch.qos.logback:logback-core + + + ch.qos.logback:logback-classic + ch.qos.logback:logback-core + diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java index 641ead173d2be..5910b9f7ae9c0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/ClusterAuthorityFetcher.java @@ -72,6 +72,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; import java.util.function.BiFunction; import static org.apache.iotdb.commons.auth.utils.AuthUtils.constructAuthorityScope; @@ -89,6 +90,8 @@ public class ClusterAuthorityFetcher implements IAuthorityFetcher { ConfigNodeClientManager.getInstance(); private static final String CONNECTERROR = "Failed to connect to config node."; + private static final long CONNECT_ERROR_LOG_INTERVAL_MS = 60_000L; + private static final AtomicLong LAST_CONNECT_ERROR_LOG_TIME = new AtomicLong(0L); public ClusterAuthorityFetcher(IAuthorCache iAuthorCache) { this.iAuthorCache = iAuthorCache; @@ -370,8 +373,9 @@ private PathPatternTree fetchAuthizedPatternTree(String username, PrivilegeType try (ConfigNodeClient configNodeClient = CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { authizedPatternTree = configNodeClient.fetchAuthizedPatternTree(req); + resetConfigNodeConnectionErrorLogTime(); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); authizedPatternTree.setStatus( RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, CONNECTERROR)); } @@ -397,6 +401,7 @@ private SettableFuture operatePermissionInternal( statementToAuthorizerReq((RelationalAuthorStatement) plan)) : configNodeClient.operatePermission( statementToAuthorizerReq((AuthorStatement) plan)); + resetConfigNodeConnectionErrorLogTime(); if (TSStatusCode.SUCCESS_STATUS.getStatusCode() != tsStatus.getCode()) { future.setException(new IoTDBException(tsStatus)); } else { @@ -406,7 +411,7 @@ private SettableFuture operatePermissionInternal( } catch (AuthException e) { future.setException(e); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); future.setException(e); } return future; @@ -471,6 +476,7 @@ private SettableFuture queryPermissionInternal( ? configNodeClient.queryRPermission( statementToAuthorizerReq((RelationalAuthorStatement) plan)) : configNodeClient.queryPermission(statementToAuthorizerReq((AuthorStatement) plan)); + resetConfigNodeConnectionErrorLogTime(); if (TSStatusCode.SUCCESS_STATUS.getStatusCode() != authorizerResp.getStatus().getCode()) { future.setException( new IoTDBException( @@ -481,7 +487,7 @@ private SettableFuture queryPermissionInternal( } catch (AuthException e) { future.setException(e); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); authorizerResp.setStatus( RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, CONNECTERROR)); future.setException(new IoTDBException(authorizerResp.getStatus())); @@ -561,8 +567,9 @@ public TSStatus checkUser( CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { // Send request to some API server status = configNodeClient.login(req); + resetConfigNodeConnectionErrorLogTime(); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); status = new TPermissionInfoResp(); status.setStatus(RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, CONNECTERROR)); } finally { @@ -593,8 +600,9 @@ public User getUser(String userName, final boolean force) { CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { // Send request to some API server permissionInfoResp = configNodeClient.getUser(userName); + resetConfigNodeConnectionErrorLogTime(); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); } if (permissionInfoResp != null && permissionInfoResp.getStatus().getCode() @@ -629,8 +637,9 @@ private TPermissionInfoResp checkPrivilegeFromConfigNode(TCheckUserPrivilegesReq CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { // Send request to some API server permissionInfoResp = configNodeClient.checkUserPrivileges(req); + resetConfigNodeConnectionErrorLogTime(); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); permissionInfoResp = new TPermissionInfoResp(); permissionInfoResp.setStatus( RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, CONNECTERROR)); @@ -660,8 +669,9 @@ private boolean checkRoleFromConfigNode(String username, String rolename) { CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { // Send request to some API server permissionInfoResp = configNodeClient.checkRoleOfUser(req); + resetConfigNodeConnectionErrorLogTime(); } catch (ClientManagerException | TException e) { - LOGGER.error(CONNECTERROR); + logConfigNodeConnectionError(); permissionInfoResp = new TPermissionInfoResp(); permissionInfoResp.setStatus( RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, CONNECTERROR)); @@ -715,6 +725,19 @@ public User cacheUser(TPermissionInfoResp tPermissionInfoResp) { return user; } + static void logConfigNodeConnectionError() { + long now = System.currentTimeMillis(); + long lastLogTime = LAST_CONNECT_ERROR_LOG_TIME.get(); + if ((lastLogTime == 0 || now - lastLogTime >= CONNECT_ERROR_LOG_INTERVAL_MS) + && LAST_CONNECT_ERROR_LOG_TIME.compareAndSet(lastLogTime, now)) { + LOGGER.error(CONNECTERROR); + } + } + + static void resetConfigNodeConnectionErrorLogTime() { + LAST_CONNECT_ERROR_LOG_TIME.set(0L); + } + /** Cache role. */ public Role cacheRole(String roleName, TPermissionInfoResp tPermissionInfoResp) { TRoleResp resp = tPermissionInfoResp.getRoleInfo().get(roleName); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/LoginLockManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/LoginLockManager.java index 6015d1d538f91..330e021f56790 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/LoginLockManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/auth/LoginLockManager.java @@ -39,6 +39,8 @@ public class LoginLockManager { private static final Logger LOGGER = LoggerFactory.getLogger(LoginLockManager.class); + private static final int MULTIPLE_USERS_FOR_IP_WARNING_THRESHOLD = 50; + private static final int MULTIPLE_IPS_FOR_USER_WARNING_THRESHOLD = 100; // Configuration parameters private final int failedLoginAttempts; @@ -48,6 +50,8 @@ public class LoginLockManager { // Lock records storage (in-memory only) private final ConcurrentMap userLocks = new ConcurrentHashMap<>(); private final ConcurrentMap userIpLocks = new ConcurrentHashMap<>(); + private final Set warnedIpsWithMultipleUsers = ConcurrentHashMap.newKeySet(); + private final Set warnedUsersWithMultipleIpLocks = ConcurrentHashMap.newKeySet(); // Exempt users who should never be locked (only valid if request is from local host) private static final Set EXEMPT_USERS; @@ -214,7 +218,7 @@ public void recordFailure(long userId, String ip) { existing.addFailureTime(now); // Check if threshold reached (log only when it just reaches) int failCountIp = existing.getFailureCount(); - if (failCountIp >= failedLoginAttempts) { + if (failCountIp == failedLoginAttempts) { LOGGER.info(DataNodeMiscMessages.IP_LOCKED, ip, userId); } return existing; @@ -236,7 +240,7 @@ public void recordFailure(long userId, String ip) { existing.addFailureTime(now); // Check if threshold reached (log only when it just reaches) int failCountUser = existing.getFailureCount(); - if (failCountUser >= failedLoginAttemptsPerUser) { + if (failCountUser == failedLoginAttemptsPerUser) { LOGGER.info( "User ID '{}' locked due to {} failed attempts", userId, @@ -262,6 +266,7 @@ public void clearFailure(long userId, String ip) { String userIpKey = buildUserIpKey(userId, ip); userIpLocks.remove(userIpKey); userLocks.remove(userId); + resetPotentialAttackWarningsIfBelowThreshold(userId, ip); } /** @@ -272,15 +277,27 @@ public void clearFailure(long userId, String ip) { */ public void unlock(long userId, String ip) { if (ip == null || ip.isEmpty()) { + Set affectedIps = new HashSet<>(); + for (String key : userIpLocks.keySet()) { + if (key.startsWith(userId + "@")) { + String[] parts = key.split("@", 2); + if (parts.length == 2) { + affectedIps.add(parts[1]); + } + } + } // Unlock global user lock userLocks.remove(userId); // Also remove all IP locks for this user userIpLocks.keySet().removeIf(key -> key.startsWith(userId + "@")); + warnedUsersWithMultipleIpLocks.remove(userId); + affectedIps.forEach(this::resetIpWarningIfBelowThreshold); LOGGER.info(DataNodeMiscMessages.USER_UNLOCKED_MANUAL, userId); } else { // Unlock specific user@ip lock String userIpKey = buildUserIpKey(userId, ip); userIpLocks.remove(userIpKey); + resetPotentialAttackWarningsIfBelowThreshold(userId, ip); LOGGER.info(DataNodeMiscMessages.IP_UNLOCKED_MANUAL, ip, userId); } } @@ -289,6 +306,8 @@ public void unlock(long userId, String ip) { public void cleanExpiredLocks() { long now = System.currentTimeMillis(); long cutoffTime = now - (passwordLockTimeMinutes * 60 * 1000L); + Set affectedUsers = new HashSet<>(); + Set affectedIps = new HashSet<>(); // Clean expired user locks userLocks @@ -315,6 +334,10 @@ public void cleanExpiredLocks() { info.removeOldFailures(cutoffTime); if (info.getFailureCount() == 0) { final String[] parts = entry.getKey().split("@", 2); + if (parts.length == 2) { + affectedUsers.add(Long.parseLong(parts[0])); + affectedIps.add(parts[1]); + } LOGGER.info( DataNodeMiscMessages.IP_UNLOCKED_EXPIRED, parts.length == 2 ? parts[1] : "", @@ -323,6 +346,9 @@ public void cleanExpiredLocks() { } return false; }); + + affectedUsers.forEach(this::resetUserWarningIfBelowThreshold); + affectedIps.forEach(this::resetIpWarningIfBelowThreshold); } // Helper methods @@ -332,15 +358,21 @@ private String buildUserIpKey(long userId, String ip) { private void checkForPotentialAttacks(long userId, String ip) { // Check if IP is locked by many users - Set usersForIp = new HashSet<>(); - for (String key : userIpLocks.keySet()) { - if (key.endsWith("@" + ip)) { - usersForIp.add(Long.parseLong(key.split("@")[0])); + if (ip != null && !ip.isEmpty()) { + Set usersForIp = new HashSet<>(); + for (String key : userIpLocks.keySet()) { + if (key.endsWith("@" + ip)) { + usersForIp.add(Long.parseLong(key.split("@")[0])); + } } - } - if (usersForIp.size() > 50) { - LOGGER.warn(DataNodeMiscMessages.IP_LOCKED_MULTIPLE_USERS, ip, usersForIp.size()); + if (usersForIp.size() > MULTIPLE_USERS_FOR_IP_WARNING_THRESHOLD) { + if (warnedIpsWithMultipleUsers.add(ip)) { + LOGGER.warn(DataNodeMiscMessages.IP_LOCKED_MULTIPLE_USERS, ip, usersForIp.size()); + } + } else { + warnedIpsWithMultipleUsers.remove(ip); + } } // Check if user has many IP locks @@ -351,9 +383,52 @@ private void checkForPotentialAttacks(long userId, String ip) { } } - if (ipsForUser.size() > 100) { - LOGGER.warn(DataNodeMiscMessages.USER_MULTIPLE_IP_LOCKS, userId, ipsForUser.size()); + if (ipsForUser.size() > MULTIPLE_IPS_FOR_USER_WARNING_THRESHOLD) { + if (warnedUsersWithMultipleIpLocks.add(userId)) { + LOGGER.warn(DataNodeMiscMessages.USER_MULTIPLE_IP_LOCKS, userId, ipsForUser.size()); + } + } else { + warnedUsersWithMultipleIpLocks.remove(userId); + } + } + + private void resetPotentialAttackWarningsIfBelowThreshold(long userId, String ip) { + resetUserWarningIfBelowThreshold(userId); + if (ip != null && !ip.isEmpty()) { + resetIpWarningIfBelowThreshold(ip); + } + } + + private void resetUserWarningIfBelowThreshold(long userId) { + if (countIpsForUser(userId) <= MULTIPLE_IPS_FOR_USER_WARNING_THRESHOLD) { + warnedUsersWithMultipleIpLocks.remove(userId); + } + } + + private void resetIpWarningIfBelowThreshold(String ip) { + if (countUsersForIp(ip) <= MULTIPLE_USERS_FOR_IP_WARNING_THRESHOLD) { + warnedIpsWithMultipleUsers.remove(ip); + } + } + + private int countUsersForIp(String ip) { + Set usersForIp = new HashSet<>(); + for (String key : userIpLocks.keySet()) { + if (key.endsWith("@" + ip)) { + usersForIp.add(Long.parseLong(key.split("@")[0])); + } + } + return usersForIp.size(); + } + + private int countIpsForUser(long userId) { + Set ipsForUser = new HashSet<>(); + for (String key : userIpLocks.keySet()) { + if (key.startsWith(userId + "@")) { + ipsForUser.add(key.split("@")[1]); + } } + return ipsForUser.size(); } public static LoginLockManager getInstance() { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 7890802f36243..cd96e3c179423 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -405,6 +405,8 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface { private static final Logger LOGGER = LoggerFactory.getLogger(DataNodeInternalRPCServiceImpl.class); + private static final long DISK_FULL_WARNING_PRINT_INTERVAL_MS = 3600 * 1000L; + private static final AtomicLong LAST_DISK_FULL_WARNING_LOG_TIME = new AtomicLong(0L); private static final SessionManager SESSION_MANAGER = SessionManager.getInstance(); @@ -2499,19 +2501,21 @@ private void sampleDiskLoad(TLoadSample loadSample) { loadSample.setFreeDiskSpace(availableDisk); loadSample.setDiskUsageRate(1d - freeDiskRatio); // Reset NodeStatus if necessary - if (freeDiskRatio < commonConfig.getDiskSpaceWarningThreshold()) { - LOGGER.warn( - "The available disk space is : {}, " - + "the total disk space is : {}, " - + "and the remaining disk usage ratio: {} is " - + "less than disk_space_warning_threshold: {}, set system to readonly!", - RamUsageEstimator.humanReadableUnits((long) availableDisk), - RamUsageEstimator.humanReadableUnits((long) totalDisk), - freeDiskRatio, - commonConfig.getDiskSpaceWarningThreshold()); + updateDiskStatusAndMaybeLog(commonConfig, availableDisk, totalDisk, freeDiskRatio); + } + } + + static void updateDiskStatusAndMaybeLog( + CommonConfig commonConfig, double availableDisk, double totalDisk, double freeDiskRatio) { + if (freeDiskRatio < commonConfig.getDiskSpaceWarningThreshold()) { + logDiskFullWarningIfNecessary(commonConfig, availableDisk, totalDisk, freeDiskRatio); + if (!NodeStatus.ReadOnly.equals(commonConfig.getNodeStatus())) { commonConfig.setNodeStatus(NodeStatus.ReadOnly); - commonConfig.setStatusReason(NodeStatus.DISK_FULL); - } else if (NodeStatus.ReadOnly.equals(commonConfig.getNodeStatus()) + } + commonConfig.setStatusReason(NodeStatus.DISK_FULL); + } else { + resetDiskFullWarningLogTime(); + if (NodeStatus.ReadOnly.equals(commonConfig.getNodeStatus()) && NodeStatus.DISK_FULL.equals(commonConfig.getStatusReason())) { commonConfig.setNodeStatus(NodeStatus.Running); commonConfig.setStatusReason(null); @@ -2519,6 +2523,28 @@ private void sampleDiskLoad(TLoadSample loadSample) { } } + private static void logDiskFullWarningIfNecessary( + CommonConfig commonConfig, double availableDisk, double totalDisk, double freeDiskRatio) { + long now = System.currentTimeMillis(); + long lastLogTime = LAST_DISK_FULL_WARNING_LOG_TIME.get(); + if ((lastLogTime == 0 || now - lastLogTime >= DISK_FULL_WARNING_PRINT_INTERVAL_MS) + && LAST_DISK_FULL_WARNING_LOG_TIME.compareAndSet(lastLogTime, now)) { + LOGGER.warn( + "The available disk space is : {}, " + + "the total disk space is : {}, " + + "and the remaining disk usage ratio: {} is " + + "less than disk_space_warning_threshold: {}, set system to readonly!", + RamUsageEstimator.humanReadableUnits((long) availableDisk), + RamUsageEstimator.humanReadableUnits((long) totalDisk), + freeDiskRatio, + commonConfig.getDiskSpaceWarningThreshold()); + } + } + + static void resetDiskFullWarningLogTime() { + LAST_DISK_FULL_WARNING_LOG_TIME.set(0L); + } + @Override public TSStatus invalidatePermissionCache(TInvalidatePermissionCacheReq req) { if (req.isSetNeedDisconnect() && req.isNeedDisconnect()) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporter.java index 444fdd2e9899b..a114fafea4eba 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporter.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporter.java @@ -70,6 +70,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; @@ -80,15 +81,30 @@ public class IoTDBInternalLocalReporter extends IoTDBInternalReporter { private static final Logger LOGGER = LoggerFactory.getLogger(IoTDBInternalLocalReporter.class); - private static final SessionManager SESSION_MANAGER = SessionManager.getInstance(); - private static final Coordinator COORDINATOR = Coordinator.getInstance(); + static final long FAILURE_LOG_INTERVAL = TimeUnit.MINUTES.toMillis(5); + private static final String UPDATE_METRIC_CONNECTION_FAILURE = + "Failed to update the value of metric because of connection failure, because "; + private static final String UPDATE_METRIC_INTERNAL_FAILURE = + "Failed to update the value of metric because of internal error, because "; + private final SessionManager sessionManager; + private final Coordinator coordinator; private final SessionInfo sessionInfo; private final IPartitionFetcher partitionFetcher; private final ISchemaFetcher schemaFetcher; private Future currentServiceFuture; private final ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor(); + private final Map updateMetricStatusFailureLogStates = + new ConcurrentHashMap<>(); + private final Map updateMetricConnectionFailureLogStates = + new ConcurrentHashMap<>(); + private final Map updateMetricInternalFailureLogStates = + new ConcurrentHashMap<>(); + private final Map autoCreateTimeseriesFailureLogStates = + new ConcurrentHashMap<>(); public IoTDBInternalLocalReporter() { + sessionManager = SessionManager.getInstance(); + coordinator = Coordinator.getInstance(); partitionFetcher = ClusterPartitionFetcher.getInstance(); schemaFetcher = ClusterSchemaFetcher.getInstance(); sessionInfo = @@ -154,6 +170,7 @@ public boolean stop() { currentServiceFuture = null; } clear(); + clearFailureLogStates(); LOGGER.info(DataNodeMiscMessages.INTERNAL_REPORTER_STOP); return true; } @@ -174,14 +191,14 @@ protected void writeMetricToIoTDB(Map valueMap, String prefix, l result = insertRecord(valueMap, prefix, time); } if (result.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn(DataNodeMiscMessages.FAILED_UPDATE_METRIC_VALUE, result); + logUpdateMetricStatusFailureIfNecessary(prefix, result); + } else { + clearUpdateMetricFailureLogStates(prefix); } } catch (IoTDBConnectionException e1) { - LOGGER.warn( - "Failed to update the value of metric because of connection failure, because ", e1); + logUpdateMetricConnectionFailureIfNecessary(prefix, e1); } catch (IllegalPathException | QueryProcessException e2) { - LOGGER.warn( - "Failed to update the value of metric because of internal error, because ", e2); + logUpdateMetricInternalFailureIfNecessary(prefix, e2); } }); } @@ -208,9 +225,9 @@ private TSStatus insertRecord(Map valueMap, String prefix, long request.setIsAligned(false); InsertRowStatement s = StatementGenerator.createStatement(request); - final long queryId = SESSION_MANAGER.requestQueryId(); + final long queryId = sessionManager.requestQueryId(); ExecutionResult result = - COORDINATOR.executeForTreeModel( + coordinator.executeForTreeModel( s, queryId, sessionInfo, "", partitionFetcher, schemaFetcher); return result.status; } @@ -236,13 +253,15 @@ private void createTimeSeries(Map valueMap, String prefix) request.setEncodings(encodings); request.setCompressors(compressors); CreateMultiTimeSeriesStatement s = StatementGenerator.createStatement(request); - final long queryId = SESSION_MANAGER.requestQueryId(); + final long queryId = sessionManager.requestQueryId(); ExecutionResult result = - COORDINATOR.executeForTreeModel( + coordinator.executeForTreeModel( s, queryId, sessionInfo, "", partitionFetcher, schemaFetcher); if (result.status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.warn(DataNodeMiscMessages.FAILED_AUTO_CREATE_TIMESERIES, paths, result.status); + logAutoCreateTimeseriesFailureIfNecessary(prefix, paths, result.status); + } else { + clearFailureLogState(autoCreateTimeseriesFailureLogStates, prefix); } } @@ -252,4 +271,107 @@ protected void writeMetricsToIoTDB(Map> valueMap, lo writeMetricToIoTDB(value.getValue(), value.getKey(), time); } } + + private void logUpdateMetricStatusFailureIfNecessary(String prefix, TSStatus status) { + if (shouldLogFailure( + updateMetricStatusFailureLogStates, prefix, getStatusFailureSignature(status))) { + LOGGER.warn(DataNodeMiscMessages.FAILED_UPDATE_METRIC_VALUE, status); + } + } + + private void logUpdateMetricConnectionFailureIfNecessary( + String prefix, IoTDBConnectionException e) { + if (shouldLogFailure( + updateMetricConnectionFailureLogStates, prefix, getExceptionFailureSignature(e))) { + LOGGER.warn(UPDATE_METRIC_CONNECTION_FAILURE, e); + } + } + + private void logUpdateMetricInternalFailureIfNecessary(String prefix, Exception e) { + if (shouldLogFailure( + updateMetricInternalFailureLogStates, prefix, getExceptionFailureSignature(e))) { + LOGGER.warn(UPDATE_METRIC_INTERNAL_FAILURE, e); + } + } + + private void logAutoCreateTimeseriesFailureIfNecessary( + String prefix, List paths, TSStatus status) { + if (shouldLogFailure( + autoCreateTimeseriesFailureLogStates, prefix, getStatusFailureSignature(status))) { + LOGGER.warn(DataNodeMiscMessages.FAILED_AUTO_CREATE_TIMESERIES, paths, status); + } + } + + private void clearUpdateMetricFailureLogStates(String prefix) { + clearFailureLogState(updateMetricStatusFailureLogStates, prefix); + clearFailureLogState(updateMetricConnectionFailureLogStates, prefix); + clearFailureLogState(updateMetricInternalFailureLogStates, prefix); + } + + private void clearFailureLogStates() { + updateMetricStatusFailureLogStates.clear(); + updateMetricConnectionFailureLogStates.clear(); + updateMetricInternalFailureLogStates.clear(); + autoCreateTimeseriesFailureLogStates.clear(); + } + + private static String getStatusFailureSignature(TSStatus status) { + return status.getCode() + ":" + status.getMessage(); + } + + private static String getExceptionFailureSignature(Exception e) { + return e.getClass().getName() + ":" + e.getMessage(); + } + + static boolean shouldLogFailure(FailureLogState failureLogState, String failureSignature) { + return shouldLogFailure(failureLogState, failureSignature, System.currentTimeMillis()); + } + + static boolean shouldLogFailure( + Map failureLogStateMap, String key, String failureSignature) { + return shouldLogFailure( + failureLogStateMap.computeIfAbsent(key, ignored -> new FailureLogState()), + failureSignature); + } + + static boolean shouldLogFailure( + Map failureLogStateMap, + String key, + String failureSignature, + long currentTime) { + return shouldLogFailure( + failureLogStateMap.computeIfAbsent(key, ignored -> new FailureLogState()), + failureSignature, + currentTime); + } + + static boolean shouldLogFailure( + FailureLogState failureLogState, String failureSignature, long currentTime) { + synchronized (failureLogState) { + if (!failureSignature.equals(failureLogState.lastFailureSignature) + || currentTime >= failureLogState.nextLogTime) { + failureLogState.lastFailureSignature = failureSignature; + failureLogState.nextLogTime = currentTime + FAILURE_LOG_INTERVAL; + return true; + } + return false; + } + } + + static void clearFailureLogState(FailureLogState failureLogState) { + synchronized (failureLogState) { + failureLogState.lastFailureSignature = ""; + failureLogState.nextLogTime = 0L; + } + } + + static void clearFailureLogState( + Map failureLogStateMap, String failureKey) { + failureLogStateMap.remove(failureKey); + } + + static class FailureLogState { + private String lastFailureSignature = ""; + private long nextLogTime = 0L; + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/file/SystemRelatedFileMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/file/SystemRelatedFileMetrics.java index 0ae45519888b3..ec1f2baabc752 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/file/SystemRelatedFileMetrics.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/service/metrics/file/SystemRelatedFileMetrics.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.service.metric.enums.Metric; import org.apache.iotdb.commons.service.metric.enums.Tag; +import org.apache.iotdb.commons.utils.LogThrottler; import org.apache.iotdb.db.i18n.DataNodeMiscMessages; import org.apache.iotdb.metrics.AbstractMetricService; import org.apache.iotdb.metrics.config.MetricConfig; @@ -47,6 +48,7 @@ public class SystemRelatedFileMetrics implements IMetricSet { private static final Logger LOGGER = LoggerFactory.getLogger(SystemRelatedFileMetrics.class); private static final MetricConfig CONFIG = MetricConfigDescriptor.getInstance().getMetricConfig(); private final Runtime runtime = Runtime.getRuntime(); + private final LogThrottler openFileHandlersFailureLogThrottler = new LogThrottler(); private String[] getOpenFileNumberCommand; @SuppressWarnings("squid:S1075") @@ -100,10 +102,14 @@ private long getOpenFileHandlersNumber() { WinNT.HANDLE hProcess = Kernel32.INSTANCE.GetCurrentProcess(); IntByReference handleCount = new IntByReference(); boolean success = Kernel32Ext.INSTANCE.GetProcessHandleCount(hProcess, handleCount); + openFileHandlersFailureLogThrottler.reset(); return success ? handleCount.getValue() : 0L; } - } catch (IOException e) { - LOGGER.warn(DataNodeMiscMessages.FAILED_GET_OPEN_FILE_NUMBER, e); + openFileHandlersFailureLogThrottler.reset(); + } catch (IOException | NumberFormatException e) { + if (openFileHandlersFailureLogThrottler.shouldLog(LogThrottler.getFailureSignature(e))) { + LOGGER.warn(DataNodeMiscMessages.FAILED_GET_OPEN_FILE_NUMBER, e); + } } return fdCount; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGenerator.java index 37c8f1a964fa1..67176298f6b0e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGenerator.java @@ -38,6 +38,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -48,6 +49,8 @@ public class TsFileNameGenerator { private static FSFactory fsFactory = FSFactoryProducer.getFSFactory(); private static final Logger LOGGER = LoggerFactory.getLogger(TsFileNameGenerator.class); + private static final long ALL_DISKS_FULL_LOG_INTERVAL_MS = 3600 * 1000L; + private static final AtomicLong LAST_ALL_DISKS_FULL_LOG_TIME = new AtomicLong(0L); public static String generateNewTsFilePath( String tsFileDir, @@ -98,36 +101,39 @@ public static String generateNewTsFilePathWithMkdir( throws DiskSpaceInsufficientException, IOException { FolderManager folderManager = TierManager.getInstance().getFolderManager(tierLevel, sequence); try { - return folderManager.getNextWithRetry( - baseDir -> { - String tsFileDir = - baseDir + String tsFilePath = + folderManager.getNextWithRetry( + baseDir -> { + String tsFileDir = + baseDir + + File.separator + + logicalStorageGroup + + File.separator + + virtualStorageGroup + + File.separator + + timePartitionId; + File targetDir = fsFactory.getFile(tsFileDir); + if (!targetDir.exists()) { + if (!targetDir.mkdirs() && !targetDir.exists()) { + throw new IOException( + "Directory creation failed: " + + tsFileDir + + " (Permission denied or parent not writable)"); + } + } + return tsFileDir + File.separator - + logicalStorageGroup - + File.separator - + virtualStorageGroup - + File.separator - + timePartitionId; - File targetDir = fsFactory.getFile(tsFileDir); - if (!targetDir.exists()) { - if (!targetDir.mkdirs() && !targetDir.exists()) { - throw new IOException( - "Directory creation failed: " - + tsFileDir - + " (Permission denied or parent not writable)"); - } - } - return tsFileDir - + File.separator - + generateNewTsFileName( - time, - version, - innerSpaceCompactionCount, - crossSpaceCompactionCount, - customSuffix); - }); + + generateNewTsFileName( + time, + version, + innerSpaceCompactionCount, + crossSpaceCompactionCount, + customSuffix); + }); + resetAllDisksFullLogTime(); + return tsFilePath; } catch (DiskSpaceInsufficientException e) { - LOGGER.error(StorageEngineMessages.ALL_DISKS_FULL_CANNOT_CREATE_TSFILE_DIR, e); + logAllDisksFullCannotCreateTsFileDirIfNecessary(e); throw new IOException(StorageEngineMessages.DISK_SPACE_INSUFFICIENT, e); } catch (Exception e) { LOGGER.warn(StorageEngineMessages.FAILED_TO_CREATE_TSFILE_DIR_AFTER_RETRIES, e); @@ -135,6 +141,19 @@ public static String generateNewTsFilePathWithMkdir( } } + static void logAllDisksFullCannotCreateTsFileDirIfNecessary(DiskSpaceInsufficientException e) { + long now = System.currentTimeMillis(); + long lastLogTime = LAST_ALL_DISKS_FULL_LOG_TIME.get(); + if ((lastLogTime == 0 || now - lastLogTime >= ALL_DISKS_FULL_LOG_INTERVAL_MS) + && LAST_ALL_DISKS_FULL_LOG_TIME.compareAndSet(lastLogTime, now)) { + LOGGER.error(StorageEngineMessages.ALL_DISKS_FULL_CANNOT_CREATE_TSFILE_DIR, e); + } + } + + static void resetAllDisksFullLogTime() { + LAST_ALL_DISKS_FULL_LOG_TIME.set(0L); + } + public static String generateNewTsFileName( long time, long version, int innerSpaceCompactionCount, int crossSpaceCompactionCount) { return generateNewTsFileName( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManager.java index 00eb47d40ae2b..14bbb82931bde 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManager.java @@ -60,6 +60,7 @@ public class WALManager implements IService { private static final Logger logger = LoggerFactory.getLogger(WALManager.class); private static final IoTDBConfig config = IoTDBDescriptor.getInstance().getConfig(); + private static final long WAL_THROTTLE_WARNING_PRINT_INTERVAL_MS = 3600 * 1000L; // manage all wal nodes and decide how to allocate them private final NodeAllocationStrategy walNodesManager; @@ -69,6 +70,7 @@ public class WALManager implements IService { private final AtomicLong totalDiskUsage = new AtomicLong(); // total number of wal files private final AtomicLong totalFileNum = new AtomicLong(); + private final AtomicLong lastWalThrottleWarningLogTime = new AtomicLong(0L); private WALManager() { if (config.getDataRegionConsensusProtocolClass().equals(ConsensusFactory.IOT_CONSENSUS) @@ -184,10 +186,7 @@ private void deleteOutdatedFiles() { while ((firstLoop || shouldThrottle())) { deleteOutdatedFilesInWALNodes(); if (firstLoop && shouldThrottle()) { - logger.warn( - "WAL disk usage {} is larger than the wal_throttle_threshold_in_byte * 0.8 {}, please check your write load, iot consensus and the pipe module. It's better to allocate more disk for WAL.", - getTotalDiskUsage(), - getThrottleThreshold()); + logWalThrottleWarningIfNecessary(); } firstLoop = false; if (Thread.interrupted()) { @@ -195,6 +194,9 @@ private void deleteOutdatedFiles() { return; } } + if (!shouldThrottle()) { + resetWalThrottleWarningLogTime(); + } } protected void deleteOutdatedFilesInWALNodes() { @@ -253,6 +255,9 @@ public void addTotalDiskUsage(long size) { public void subtractTotalDiskUsage(long size) { totalDiskUsage.accumulateAndGet(size, (x, y) -> x - y); + if (!shouldThrottle()) { + resetWalThrottleWarningLogTime(); + } } public long getTotalFileNum() { @@ -321,9 +326,30 @@ public void syncDeleteOutdatedFilesInWALNodes() { public void clear() { totalDiskUsage.set(0); + resetWalThrottleWarningLogTime(); walNodesManager.clear(); } + void logWalThrottleWarningIfNecessary() { + if (!shouldThrottle()) { + resetWalThrottleWarningLogTime(); + return; + } + long now = System.currentTimeMillis(); + long lastLogTime = lastWalThrottleWarningLogTime.get(); + if ((lastLogTime == 0 || now - lastLogTime >= WAL_THROTTLE_WARNING_PRINT_INTERVAL_MS) + && lastWalThrottleWarningLogTime.compareAndSet(lastLogTime, now)) { + logger.warn( + "WAL disk usage {} is larger than the wal_throttle_threshold_in_byte * 0.8 {}, please check your write load, iot consensus and the pipe module. It's better to allocate more disk for WAL.", + getTotalDiskUsage(), + getThrottleThreshold()); + } + } + + void resetWalThrottleWarningLogTime() { + lastWalThrottleWarningLogTime.set(0L); + } + @Override public ServiceType getID() { return ServiceType.WAL_SERVICE; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/AbstractNodeAllocationStrategy.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/AbstractNodeAllocationStrategy.java index de97bb313f652..99806a4533309 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/AbstractNodeAllocationStrategy.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/AbstractNodeAllocationStrategy.java @@ -35,14 +35,17 @@ import java.io.File; import java.io.IOException; import java.util.Arrays; +import java.util.concurrent.atomic.AtomicLong; public abstract class AbstractNodeAllocationStrategy implements NodeAllocationStrategy { private static final Logger logger = LoggerFactory.getLogger(AbstractNodeAllocationStrategy.class); private static final CommonConfig commonConfig = CommonDescriptor.getInstance().getConfig(); + private static final long FAIL_TO_CREATE_WAL_NODE_DISKS_FULL_LOG_INTERVAL_MS = 3600 * 1000L; // manage wal folders protected FolderManager folderManager; + private final AtomicLong lastFailToCreateWalNodeDisksFullLogTime = new AtomicLong(0L); protected AbstractNodeAllocationStrategy() { try { @@ -52,9 +55,7 @@ protected AbstractNodeAllocationStrategy() { } catch (DiskSpaceInsufficientException e) { // folderManager remains null when disk space is insufficient during initialization // It will be lazily initialized later when disk space becomes available - logger.error( - "Fail to create wal node allocation strategy because all disks of wal folders are full.", - e); + logFailToCreateWalNodeDisksFullIfNecessary(e); } } @@ -68,10 +69,13 @@ protected synchronized IWALNode createWALNode(String identifier) { new FolderManager( Arrays.asList(commonConfig.getWalDirs()), DirectoryStrategyType.SEQUENCE_STRATEGY); } - return folderManager.getNextWithRetry( - folder -> new WALNode(identifier, folder + File.separator + identifier)); + WALNode walNode = + folderManager.getNextWithRetry( + folder -> new WALNode(identifier, folder + File.separator + identifier)); + resetFailToCreateWalNodeDisksFullLogTime(); + return walNode; } catch (DiskSpaceInsufficientException e) { - logger.error(StorageEngineMessages.FAIL_TO_CREATE_WAL_NODE_DISKS_FULL, e); + logFailToCreateWalNodeDisksFullIfNecessary(e); return WALFakeNode.getFailureInstance(e); } catch (Exception e) { logger.warn(StorageEngineMessages.FAILED_TO_CREATE_WAL_NODE_AFTER_RETRIES + identifier, e); @@ -84,10 +88,26 @@ protected synchronized IWALNode createWALNode(String identifier) { protected IWALNode createWALNode( String identifier, String folder, long startFileVersion, long startSearchIndex) { try { - return new WALNode(identifier, folder, startFileVersion, startSearchIndex); + WALNode walNode = new WALNode(identifier, folder, startFileVersion, startSearchIndex); + resetFailToCreateWalNodeDisksFullLogTime(); + return walNode; } catch (IOException e) { logger.error(StorageEngineMessages.FAIL_TO_CREATE_WAL_NODE, e); return WALFakeNode.getFailureInstance(e); } } + + private void logFailToCreateWalNodeDisksFullIfNecessary(DiskSpaceInsufficientException e) { + long now = System.currentTimeMillis(); + long lastLogTime = lastFailToCreateWalNodeDisksFullLogTime.get(); + if ((lastLogTime == 0 + || now - lastLogTime >= FAIL_TO_CREATE_WAL_NODE_DISKS_FULL_LOG_INTERVAL_MS) + && lastFailToCreateWalNodeDisksFullLogTime.compareAndSet(lastLogTime, now)) { + logger.error(StorageEngineMessages.FAIL_TO_CREATE_WAL_NODE_DISKS_FULL, e); + } + } + + void resetFailToCreateWalNodeDisksFullLogTime() { + lastFailToCreateWalNodeDisksFullLogTime.set(0L); + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadDirScanner.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadDirScanner.java index 2a3ddee11f7b8..9b0d810b14d63 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadDirScanner.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadDirScanner.java @@ -44,8 +44,12 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; import java.util.concurrent.CopyOnWriteArraySet; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Stream; @@ -53,12 +57,22 @@ public class ActiveLoadDirScanner extends ActiveLoadScheduledExecutorService { private static final Logger LOGGER = LoggerFactory.getLogger(ActiveLoadDirScanner.class); + private static final long ACTIVE_LOAD_FAILURE_LOG_INTERVAL_MS = TimeUnit.HOURS.toMillis(1); + private final AtomicReference listeningDirsConfig = new AtomicReference<>(); private final Set listeningDirs = new CopyOnWriteArraySet<>(); private final Set noPermissionDirs = new CopyOnWriteArraySet<>(); private final AtomicBoolean isReadOnlyLogPrinted = new AtomicBoolean(false); + private final AtomicLong lastScanFailureLogTime = new AtomicLong(0L); + private final AtomicLong lastHotReloadFailureLogTime = new AtomicLong(0L); + private final ConcurrentMap lastDirScanFailureLogTimeMap = + new ConcurrentHashMap<>(); + private final ConcurrentMap lastPermissionCheckFailureLogTimeMap = + new ConcurrentHashMap<>(); + private final ConcurrentMap lastCreateDirectoryFailureLogTimeMap = + new ConcurrentHashMap<>(); private final ActiveLoadTsFileLoader activeLoadTsFileLoader; @@ -73,8 +87,11 @@ public ActiveLoadDirScanner(final ActiveLoadTsFileLoader activeLoadTsFileLoader) private void scanSafely() { try { scan(); + lastScanFailureLogTime.set(0L); } catch (final Exception e) { - LOGGER.warn(StorageEngineMessages.ERROR_ACTIVE_LOAD_DIR_SCANNING, e); + if (shouldPrintFailureLog(lastScanFailureLogTime)) { + LOGGER.warn(StorageEngineMessages.ERROR_ACTIVE_LOAD_DIR_SCANNING, e); + } } } @@ -132,10 +149,14 @@ private void scan() throws IOException { isTableModel, isGeneratedByPipe); }); + lastDirScanFailureLogTimeMap.remove(listeningDir); } catch (UncheckedIOException e) { LOGGER.debug(StorageEngineMessages.FILE_DELETED_IGNORE_EXCEPTION); + lastDirScanFailureLogTimeMap.remove(listeningDir); } catch (final Exception e) { - LOGGER.warn(StorageEngineMessages.EXCEPTION_SCANNING_DIR, listeningDir, e); + if (shouldPrintFailureLog(lastDirScanFailureLogTimeMap, listeningDir)) { + LOGGER.warn(StorageEngineMessages.EXCEPTION_SCANNING_DIR, listeningDir, e); + } } } } @@ -168,12 +189,15 @@ private boolean checkPermission(final String listeningDir) { } noPermissionDirs.remove(listeningDir); + lastPermissionCheckFailureLogTimeMap.remove(listeningDir); return true; } catch (final Exception e) { - LOGGER.error( - "Error occurred during checking r/w permission of dir: {}. Skip scanning this dir.", - listeningDir, - e); + if (shouldPrintFailureLog(lastPermissionCheckFailureLogTimeMap, listeningDir)) { + LOGGER.error( + "Error occurred during checking r/w permission of dir: {}. Skip scanning this dir.", + listeningDir, + e); + } return false; } } @@ -213,23 +237,42 @@ private void hotReloadActiveLoadDirs() { ActiveLoadingFilesNumberMetricsSet.getInstance().updatePendingDirList(listeningDirs); ActiveLoadingFilesSizeMetricsSet.getInstance().updatePendingDirList(listeningDirs); + lastHotReloadFailureLogTime.set(0L); } catch (final Exception e) { - LOGGER.warn( - "Error occurred during hot reload active load dirs. " - + "Current active load listening dirs: {}.", - listeningDirs, - e); + if (shouldPrintFailureLog(lastHotReloadFailureLogTime)) { + LOGGER.warn( + "Error occurred during hot reload active load dirs. " + + "Current active load listening dirs: {}.", + listeningDirs, + e); + } } } private void createDirectoriesIfNotExists(final String dirPath) { try { FileUtils.forceMkdir(new File(dirPath)); + lastCreateDirectoryFailureLogTimeMap.remove(dirPath); } catch (final IOException e) { - LOGGER.warn(StorageEngineMessages.ERROR_CREATING_DIR_FOR_ACTIVE_LOAD, dirPath, e); + if (shouldPrintFailureLog(lastCreateDirectoryFailureLogTimeMap, dirPath)) { + LOGGER.warn(StorageEngineMessages.ERROR_CREATING_DIR_FOR_ACTIVE_LOAD, dirPath, e); + } } } + private static boolean shouldPrintFailureLog(AtomicLong lastLogTime) { + long now = System.currentTimeMillis(); + long previousLogTime = lastLogTime.get(); + return (previousLogTime == 0L || now - previousLogTime >= ACTIVE_LOAD_FAILURE_LOG_INTERVAL_MS) + && lastLogTime.compareAndSet(previousLogTime, now); + } + + private static boolean shouldPrintFailureLog( + ConcurrentMap lastLogTimeMap, String key) { + return shouldPrintFailureLog( + lastLogTimeMap.computeIfAbsent(String.valueOf(key), value -> new AtomicLong(0L))); + } + // Metrics public long countAndReportActiveListeningDirsFileNumber() { long totalFileCount = 0; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorService.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorService.java index 6bd266458407a..eaca8e828cbbf 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorService.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorService.java @@ -36,12 +36,15 @@ import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; public abstract class ActiveLoadScheduledExecutorService { private static final Logger LOGGER = LoggerFactory.getLogger(ActiveLoadScheduledExecutorService.class); + private static final long ACTIVE_LOAD_JOB_FAILURE_LOG_INTERVAL_MS = TimeUnit.HOURS.toMillis(1); + protected static final IoTDBConfig IOTDB_CONFIG = IoTDBDescriptor.getInstance().getConfig(); private static final long MIN_EXECUTION_INTERVAL_SECONDS = @@ -59,19 +62,36 @@ protected ActiveLoadScheduledExecutorService(final ThreadName threadName) { public void register(Runnable runnable) { jobs.add( new Pair<>( - new WrappedRunnable() { - @Override - public void runMayThrow() { - try { - runnable.run(); - } catch (Exception e) { - LOGGER.warn(StorageEngineMessages.ERROR_EXECUTING_ACTIVE_LOAD_JOB, e); - } - } - }, + wrapRunnableWithFailureLogThrottle(runnable), Math.max(MIN_EXECUTION_INTERVAL_SECONDS, 1))); } + static WrappedRunnable wrapRunnableWithFailureLogThrottle(Runnable runnable) { + return new WrappedRunnable() { + private final AtomicLong lastFailureLogTime = new AtomicLong(0L); + + @Override + public void runMayThrow() { + try { + runnable.run(); + lastFailureLogTime.set(0L); + } catch (Exception e) { + logActiveLoadJobFailureIfNecessary(e, lastFailureLogTime); + } + } + }; + } + + private static void logActiveLoadJobFailureIfNecessary( + Exception e, AtomicLong lastFailureLogTime) { + long now = System.currentTimeMillis(); + long previousLogTime = lastFailureLogTime.get(); + if ((previousLogTime == 0L || now - previousLogTime >= ACTIVE_LOAD_JOB_FAILURE_LOG_INTERVAL_MS) + && lastFailureLogTime.compareAndSet(previousLogTime, now)) { + LOGGER.warn(StorageEngineMessages.ERROR_EXECUTING_ACTIVE_LOAD_JOB, e); + } + } + public synchronized void start() { if (future == null) { future = diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/trigger/service/TriggerInformationUpdater.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/trigger/service/TriggerInformationUpdater.java index 3b37eee7d1ff4..0954007768117 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/trigger/service/TriggerInformationUpdater.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/trigger/service/TriggerInformationUpdater.java @@ -26,6 +26,7 @@ import org.apache.iotdb.commons.consensus.ConfigRegionId; import org.apache.iotdb.commons.exception.IoTDBException; import org.apache.iotdb.commons.trigger.TriggerInformation; +import org.apache.iotdb.commons.utils.LogThrottler; import org.apache.iotdb.confignode.rpc.thrift.TGetTriggerTableResp; import org.apache.iotdb.db.i18n.DataNodeMiscMessages; import org.apache.iotdb.db.protocol.client.ConfigNodeClient; @@ -56,6 +57,7 @@ public class TriggerInformationUpdater { private Future updateFuture; private static final long UPDATE_INTERVAL = 1000L * 60; + private final LogThrottler updateFailureLogThrottler = new LogThrottler(); public void startTriggerInformationUpdater() { if (updateFuture == null) { @@ -97,8 +99,11 @@ public void updateTask() { .updateLocationOfStatefulTrigger( triggerInformation.getTriggerName(), triggerInformation.getDataNodeLocation()); } + updateFailureLogThrottler.reset(); } catch (Exception e) { - LOGGER.warn(DataNodeMiscMessages.ERROR_UPDATING_TRIGGER_INFO, e); + if (updateFailureLogThrottler.shouldLog(LogThrottler.getFailureSignature(e))) { + LOGGER.warn(DataNodeMiscMessages.ERROR_UPDATING_TRIGGER_INFO, e); + } } } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/AuthorizerManagerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/AuthorizerManagerTest.java index 7023a3d9fac13..9ad64081ff9b5 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/AuthorizerManagerTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/AuthorizerManagerTest.java @@ -30,8 +30,11 @@ import org.apache.iotdb.confignode.rpc.thrift.TUserResp; import org.apache.iotdb.rpc.TSStatusCode; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; import org.junit.Assert; import org.junit.Test; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collections; @@ -44,6 +47,31 @@ public class AuthorizerManagerTest { ClusterAuthorityFetcher authorityFetcher = new ClusterAuthorityFetcher(new BasicAuthorityCache()); + @Test + public void configNodeConnectionErrorLoggedOnlyOnceUntilReset() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(ClusterAuthorityFetcher.class); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + ClusterAuthorityFetcher.resetConfigNodeConnectionErrorLogTime(); + try { + ClusterAuthorityFetcher.logConfigNodeConnectionError(); + ClusterAuthorityFetcher.logConfigNodeConnectionError(); + Assert.assertEquals(1, countLogEvents(appender, "Failed to connect to config node.")); + + ClusterAuthorityFetcher.resetConfigNodeConnectionErrorLogTime(); + ClusterAuthorityFetcher.logConfigNodeConnectionError(); + Assert.assertEquals(2, countLogEvents(appender, "Failed to connect to config node.")); + } finally { + ClusterAuthorityFetcher.resetConfigNodeConnectionErrorLogTime(); + logger.detachAppender(appender); + appender.stop(); + } + } + @Test public void permissionCacheTest() throws IllegalPathException { User user = new User(); @@ -258,4 +286,10 @@ public void grantOptTest() throws IllegalPathException { TSStatusCode.SUCCESS_STATUS.getStatusCode(), authorityFetcher.checkUserSysPrivilegesGrantOpt("user1", PrivilegeType.USE_CQ).getCode()); } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/LoginLockManagerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/LoginLockManagerTest.java index 0a9e95886a82c..6121dbbb0ce26 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/LoginLockManagerTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/auth/LoginLockManagerTest.java @@ -20,9 +20,14 @@ package org.apache.iotdb.db.auth; import org.apache.iotdb.db.auth.LoginLockManager.UserLockInfo; +import org.apache.iotdb.db.i18n.DataNodeMiscMessages; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; import org.junit.Before; import org.junit.Test; +import org.slf4j.LoggerFactory; import java.lang.reflect.Field; import java.net.InetAddress; @@ -141,6 +146,72 @@ public void testIpLockAfterMaxAttempts() { lockManager.checkLock(TEST_USER_ID, TEST_IP)); } + @Test + public void testLockMessagesLoggedOnlyWhenThresholdFirstReached() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(LoginLockManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.INFO); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + for (int i = 0; i < failedLoginAttemptsPerUser + 2; i++) { + lockManager.recordFailure(TEST_USER_ID, TEST_IP); + } + + assertEquals(1, countLogEvents(appender, DataNodeMiscMessages.IP_LOCKED)); + assertEquals(1, countLogEvents(appender, "User ID '{}' locked due to {} failed attempts")); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + @Test + public void testPotentialAttackWarningsLoggedOnlyOnceWhileThresholdExceeded() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(LoginLockManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.INFO); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + for (int i = 0; i < 55; i++) { + lockManager.recordFailure(TEST_USER_ID + i, TEST_IP); + } + for (int i = 0; i < 5; i++) { + lockManager.recordFailure(TEST_USER_ID + i, TEST_IP); + } + + for (int i = 0; i < 105; i++) { + lockManager.recordFailure(OTHER_USER_ID, "172.16.0." + i); + } + for (int i = 0; i < 5; i++) { + lockManager.recordFailure(OTHER_USER_ID, "172.16.0." + i); + } + + assertEquals(1, countLogEvents(appender, DataNodeMiscMessages.IP_LOCKED_MULTIPLE_USERS)); + assertEquals(1, countLogEvents(appender, DataNodeMiscMessages.USER_MULTIPLE_IP_LOCKS)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } + @Test public void testGlobalUserLockAfterMaxAttempts() { for (int i = 0; i < failedLoginAttemptsPerUser; i++) { diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/conf/directories/strategy/DirectoryStrategyTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/conf/directories/strategy/DirectoryStrategyTest.java index 6343dcc169941..a6a9b9e9c2b6f 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/conf/directories/strategy/DirectoryStrategyTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/conf/directories/strategy/DirectoryStrategyTest.java @@ -18,14 +18,19 @@ */ package org.apache.iotdb.db.conf.directories.strategy; +import org.apache.iotdb.commons.disk.strategy.DirectoryStrategy; import org.apache.iotdb.commons.disk.strategy.MaxDiskUsableSpaceFirstStrategy; import org.apache.iotdb.commons.disk.strategy.MinFolderOccupiedSpaceFirstStrategy; import org.apache.iotdb.commons.disk.strategy.RandomOnDiskUsableSpaceStrategy; import org.apache.iotdb.commons.disk.strategy.SequenceStrategy; import org.apache.iotdb.commons.exception.DiskSpaceInsufficientException; +import org.apache.iotdb.commons.i18n.UtilMessages; import org.apache.iotdb.commons.utils.JVMCommonUtils; import org.apache.iotdb.db.utils.constant.TestConstant; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -34,6 +39,7 @@ import org.powermock.core.classloader.annotations.PowerMockIgnore; import org.powermock.core.classloader.annotations.PrepareForTest; import org.powermock.modules.junit4.PowerMockRunner; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; @@ -77,7 +83,9 @@ public void setUp() throws IOException { } @After - public void tearDown() {} + public void tearDown() { + MinFolderOccupiedSpaceFirstStrategy.resetCannotCalculateOccupiedSpaceLogTimes(); + } @Test public void testSequenceStrategy() throws DiskSpaceInsufficientException { @@ -146,6 +154,42 @@ public void testMinFolderOccupiedSpaceFirstStrategy() } } + @Test + public void testMinFolderOccupiedSpaceFirstStrategyLogThrottle() throws Exception { + MinFolderOccupiedSpaceFirstStrategy minFolderOccupiedSpaceFirstStrategy = + new MinFolderOccupiedSpaceFirstStrategy(); + minFolderOccupiedSpaceFirstStrategy.setFolders(dataDirList); + + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(DirectoryStrategy.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + PowerMockito.doThrow(new IOException("broken occupied space")) + .when(JVMCommonUtils.class, "getOccupiedSpace", dataDirList.get(0)); + assertEquals(2, minFolderOccupiedSpaceFirstStrategy.nextFolderIndex()); + assertEquals(2, minFolderOccupiedSpaceFirstStrategy.nextFolderIndex()); + assertEquals(1, countLogEvents(appender, UtilMessages.CANNOT_CALCULATE_OCCUPIED_SPACE)); + + PowerMockito.doReturn(0L).when(JVMCommonUtils.class, "getOccupiedSpace", dataDirList.get(0)); + assertEquals(0, minFolderOccupiedSpaceFirstStrategy.nextFolderIndex()); + + PowerMockito.doThrow(new IOException("broken occupied space again")) + .when(JVMCommonUtils.class, "getOccupiedSpace", dataDirList.get(0)); + assertEquals(2, minFolderOccupiedSpaceFirstStrategy.nextFolderIndex()); + assertEquals(2, countLogEvents(appender, UtilMessages.CANNOT_CALCULATE_OCCUPIED_SPACE)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + private int getIndexOfMinOccupiedSpace() throws IOException { int index = -1; long minOccupied = Long.MAX_VALUE; @@ -207,4 +251,14 @@ public void testAllDiskFull() { } catch (DiskSpaceInsufficientException e) { } } + + private long countLogEvents(ListAppender appender, String messagePattern) { + long count = 0; + for (ILoggingEvent event : appender.list) { + if (messagePattern.equals(event.getMessage())) { + count++; + } + } + return count; + } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImplLogTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImplLogTest.java new file mode 100644 index 0000000000000..64a7cb4945761 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImplLogTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.protocol.thrift.impl; + +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.conf.CommonConfig; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.db.conf.IoTDBConfig; +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +public class DataNodeInternalRPCServiceImplLogTest { + + private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig(); + private static final IoTDBConfig DATANODE_CONFIG = IoTDBDescriptor.getInstance().getConfig(); + private static final String DISK_FULL_WARNING_MESSAGE_PREFIX = + "The available disk space is : {}, "; + + private double previousDiskSpaceWarningThreshold; + private NodeStatus previousNodeStatus; + private String previousStatusReason; + private int previousDataNodeId; + + @Before + public void setUp() { + previousDiskSpaceWarningThreshold = COMMON_CONFIG.getDiskSpaceWarningThreshold(); + previousNodeStatus = COMMON_CONFIG.getNodeStatus(); + previousStatusReason = COMMON_CONFIG.getStatusReason(); + previousDataNodeId = DATANODE_CONFIG.getDataNodeId(); + + DATANODE_CONFIG.setDataNodeId(0); + COMMON_CONFIG.setDiskSpaceWarningThreshold(0.5); + COMMON_CONFIG.setNodeStatus(NodeStatus.Running); + COMMON_CONFIG.setStatusReason(null); + DataNodeInternalRPCServiceImpl.resetDiskFullWarningLogTime(); + } + + @After + public void tearDown() { + COMMON_CONFIG.setDiskSpaceWarningThreshold(previousDiskSpaceWarningThreshold); + COMMON_CONFIG.setNodeStatus(previousNodeStatus); + COMMON_CONFIG.setStatusReason(previousStatusReason); + DATANODE_CONFIG.setDataNodeId(previousDataNodeId); + DataNodeInternalRPCServiceImpl.resetDiskFullWarningLogTime(); + } + + @Test + public void diskFullWarningLoggedOnlyOnceUntilDiskRecovers() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) + LoggerFactory.getLogger(DataNodeInternalRPCServiceImpl.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + DataNodeInternalRPCServiceImpl.updateDiskStatusAndMaybeLog(COMMON_CONFIG, 10.0, 100.0, 0.1); + DataNodeInternalRPCServiceImpl.updateDiskStatusAndMaybeLog(COMMON_CONFIG, 10.0, 100.0, 0.1); + + Assert.assertEquals(1, countLogEvents(appender, DISK_FULL_WARNING_MESSAGE_PREFIX)); + Assert.assertEquals(NodeStatus.ReadOnly, COMMON_CONFIG.getNodeStatus()); + Assert.assertEquals(NodeStatus.DISK_FULL, COMMON_CONFIG.getStatusReason()); + + DataNodeInternalRPCServiceImpl.updateDiskStatusAndMaybeLog(COMMON_CONFIG, 90.0, 100.0, 0.9); + Assert.assertEquals(NodeStatus.Running, COMMON_CONFIG.getNodeStatus()); + Assert.assertNull(COMMON_CONFIG.getStatusReason()); + + DataNodeInternalRPCServiceImpl.updateDiskStatusAndMaybeLog(COMMON_CONFIG, 10.0, 100.0, 0.1); + Assert.assertEquals(2, countLogEvents(appender, DISK_FULL_WARNING_MESSAGE_PREFIX)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePrefix) { + return appender.list.stream() + .filter(event -> event.getMessage().startsWith(messagePrefix)) + .count(); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporterTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporterTest.java new file mode 100644 index 0000000000000..f23a762226615 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/metrics/IoTDBInternalLocalReporterTest.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.service.metrics; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +public class IoTDBInternalLocalReporterTest { + + @Test + public void sameFailureLoggedOnlyAfterIntervalOrRecovery() { + IoTDBInternalLocalReporter.FailureLogState failureLogState = + new IoTDBInternalLocalReporter.FailureLogState(); + long startTime = 1000L; + + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure(failureLogState, "failure-status", startTime)); + Assert.assertFalse( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, "failure-status", startTime + 1)); + + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, + "failure-status", + startTime + IoTDBInternalLocalReporter.FAILURE_LOG_INTERVAL)); + Assert.assertFalse( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, + "failure-status", + startTime + IoTDBInternalLocalReporter.FAILURE_LOG_INTERVAL + 1)); + + IoTDBInternalLocalReporter.clearFailureLogState(failureLogState); + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, "failure-status", startTime + 2)); + } + + @Test + public void differentFailuresLoggedImmediately() { + IoTDBInternalLocalReporter.FailureLogState failureLogState = + new IoTDBInternalLocalReporter.FailureLogState(); + long startTime = 1000L; + + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, "failure-status-1", startTime)); + Assert.assertFalse( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, "failure-status-1", startTime + 1)); + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogState, "failure-status-2", startTime + 2)); + } + + @Test + public void failureLogStateIsIsolatedByMetricPrefix() { + Map failureLogStateMap = + new ConcurrentHashMap<>(); + long startTime = 1000L; + + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogStateMap, "root.__system.metric1", "failure-status", startTime)); + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogStateMap, "root.__system.metric2", "failure-status", startTime + 1)); + Assert.assertFalse( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogStateMap, "root.__system.metric1", "failure-status", startTime + 2)); + + IoTDBInternalLocalReporter.clearFailureLogState(failureLogStateMap, "root.__system.metric2"); + Assert.assertTrue( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogStateMap, "root.__system.metric2", "failure-status", startTime + 3)); + Assert.assertFalse( + IoTDBInternalLocalReporter.shouldLogFailure( + failureLogStateMap, "root.__system.metric1", "failure-status", startTime + 4)); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGeneratorLogTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGeneratorLogTest.java new file mode 100644 index 0000000000000..df17e2ebffa52 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/generator/TsFileNameGeneratorLogTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.tsfile.generator; + +import org.apache.iotdb.commons.exception.DiskSpaceInsufficientException; +import org.apache.iotdb.db.i18n.StorageEngineMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; + +public class TsFileNameGeneratorLogTest { + + @Before + public void setUp() { + TsFileNameGenerator.resetAllDisksFullLogTime(); + } + + @After + public void tearDown() { + TsFileNameGenerator.resetAllDisksFullLogTime(); + } + + @Test + public void allDisksFullTsFileDirErrorLoggedOnlyOnceUntilRecovery() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(TsFileNameGenerator.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + DiskSpaceInsufficientException exception = + new DiskSpaceInsufficientException(Arrays.asList("folder1", "folder2")); + try { + TsFileNameGenerator.logAllDisksFullCannotCreateTsFileDirIfNecessary(exception); + TsFileNameGenerator.logAllDisksFullCannotCreateTsFileDirIfNecessary(exception); + Assert.assertEquals( + 1, + countLogEvents(appender, StorageEngineMessages.ALL_DISKS_FULL_CANNOT_CREATE_TSFILE_DIR)); + + TsFileNameGenerator.resetAllDisksFullLogTime(); + TsFileNameGenerator.logAllDisksFullCannotCreateTsFileDirIfNecessary(exception); + Assert.assertEquals( + 2, + countLogEvents(appender, StorageEngineMessages.ALL_DISKS_FULL_CANNOT_CREATE_TSFILE_DIR)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManagerLogTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManagerLogTest.java new file mode 100644 index 0000000000000..edd68ef67e846 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/WALManagerLogTest.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.wal; + +import org.apache.iotdb.db.conf.IoTDBConfig; +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +public class WALManagerLogTest { + + private static final IoTDBConfig CONFIG = IoTDBDescriptor.getInstance().getConfig(); + private static final String WAL_THROTTLE_WARNING_MESSAGE = + "WAL disk usage {} is larger than the wal_throttle_threshold_in_byte * 0.8 {}, please check your write load, iot consensus and the pipe module. It's better to allocate more disk for WAL."; + + private long previousThrottleThreshold; + + @Before + public void setUp() { + previousThrottleThreshold = CONFIG.getThrottleThreshold(); + CONFIG.setThrottleThreshold(100); + WALManager.getInstance().clear(); + WALManager.getInstance().resetWalThrottleWarningLogTime(); + } + + @After + public void tearDown() { + WALManager.getInstance().clear(); + WALManager.getInstance().resetWalThrottleWarningLogTime(); + CONFIG.setThrottleThreshold(previousThrottleThreshold); + } + + @Test + public void walThrottleWarningLoggedOnlyOnceUntilUsageRecovers() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(WALManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + WALManager walManager = WALManager.getInstance(); + try { + long throttleThreshold = walManager.getThrottleThreshold(); + walManager.addTotalDiskUsage(throttleThreshold * 2); + walManager.logWalThrottleWarningIfNecessary(); + walManager.logWalThrottleWarningIfNecessary(); + + Assert.assertEquals(1, countLogEvents(appender, WAL_THROTTLE_WARNING_MESSAGE)); + + walManager.subtractTotalDiskUsage(throttleThreshold / 2); + walManager.logWalThrottleWarningIfNecessary(); + Assert.assertEquals(1, countLogEvents(appender, WAL_THROTTLE_WARNING_MESSAGE)); + + walManager.subtractTotalDiskUsage(throttleThreshold + throttleThreshold / 2); + walManager.logWalThrottleWarningIfNecessary(); + Assert.assertEquals(1, countLogEvents(appender, WAL_THROTTLE_WARNING_MESSAGE)); + + walManager.addTotalDiskUsage(throttleThreshold); + walManager.logWalThrottleWarningIfNecessary(); + Assert.assertEquals(2, countLogEvents(appender, WAL_THROTTLE_WARNING_MESSAGE)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/FirstCreateStrategyTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/FirstCreateStrategyTest.java index dc614364a4ecc..5b4e3852c30c1 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/FirstCreateStrategyTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/allocation/FirstCreateStrategyTest.java @@ -18,11 +18,14 @@ */ package org.apache.iotdb.db.storageengine.dataregion.wal.allocation; +import org.apache.iotdb.commons.cluster.NodeStatus; import org.apache.iotdb.commons.conf.CommonConfig; import org.apache.iotdb.commons.conf.CommonDescriptor; import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.queryengine.plan.planner.plan.node.PlanNodeId; +import org.apache.iotdb.commons.utils.JVMCommonUtils; +import org.apache.iotdb.db.i18n.StorageEngineMessages; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; import org.apache.iotdb.db.storageengine.dataregion.wal.node.IWALNode; import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; @@ -30,6 +33,9 @@ import org.apache.iotdb.db.utils.EnvironmentUtils; import org.apache.iotdb.db.utils.constant.TestConstant; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; import org.apache.tsfile.common.conf.TSFileConfig; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.utils.Binary; @@ -37,6 +43,7 @@ import org.junit.After; import org.junit.Before; import org.junit.Test; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -57,12 +64,19 @@ public class FirstCreateStrategyTest { TestConstant.BASE_OUTPUT_PATH.concat("wal_test3") }; private String[] prevWalDirs; + private NodeStatus previousNodeStatus; + private String previousStatusReason; + private double previousDiskSpaceWarningThreshold; @Before public void setUp() throws Exception { prevWalDirs = commonConfig.getWalDirs(); + previousNodeStatus = commonConfig.getNodeStatus(); + previousStatusReason = commonConfig.getStatusReason(); + previousDiskSpaceWarningThreshold = commonConfig.getDiskSpaceWarningThreshold(); EnvironmentUtils.envSetUp(); commonConfig.setWalDirs(walDirs); + setDiskSpaceWarningThresholdForTest(previousDiskSpaceWarningThreshold); } @After @@ -72,6 +86,9 @@ public void tearDown() throws Exception { EnvironmentUtils.cleanDir(walDir); } commonConfig.setWalDirs(prevWalDirs); + setDiskSpaceWarningThresholdForTest(previousDiskSpaceWarningThreshold); + commonConfig.setNodeStatus(previousNodeStatus); + commonConfig.setStatusReason(previousStatusReason); } @Test @@ -222,6 +239,47 @@ public void testReInitializeAfterDiskSpaceCleaned() throws IllegalPathException, } } + @Test + public void testFailToCreateWalNodeDisksFullLoggedOnlyOnceUntilRecovery() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) + LoggerFactory.getLogger(AbstractNodeAllocationStrategy.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + FirstCreateStrategy strategy = null; + try { + setDiskSpaceWarningThresholdForTest(1.1); + strategy = new FirstCreateStrategy(); + strategy.applyForWALNode("failed-1"); + strategy.applyForWALNode("failed-2"); + + assertEquals( + 1, countLogEvents(appender, StorageEngineMessages.FAIL_TO_CREATE_WAL_NODE_DISKS_FULL)); + + setDiskSpaceWarningThresholdForTest(-1.0); + IWALNode walNode = strategy.applyForWALNode("success"); + assertTrue(walNode instanceof WALNode); + + strategy.folderManager = null; + setDiskSpaceWarningThresholdForTest(1.1); + strategy.applyForWALNode("failed-3"); + assertEquals( + 2, countLogEvents(appender, StorageEngineMessages.FAIL_TO_CREATE_WAL_NODE_DISKS_FULL)); + } finally { + if (strategy != null) { + strategy.clear(); + } + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + private InsertRowNode getInsertRowNode() throws IllegalPathException { long time = 110L; TSDataType[] dataTypes = @@ -263,4 +321,15 @@ private InsertRowNode getInsertRowNode() throws IllegalPathException { columns, true); } + + private void setDiskSpaceWarningThresholdForTest(double threshold) { + commonConfig.setDiskSpaceWarningThreshold(threshold); + JVMCommonUtils.setDiskSpaceWarningThreshold(threshold); + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java index e3d1efb4edd19..ff5ae7d658215 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNodeTest.java @@ -333,7 +333,7 @@ public void testConcurrentCheckpoint() throws Exception { @Test public void testDeleteOutdatedFiles() throws Exception { List walFlushListeners = new ArrayList<>(); - // write until log is rolled + // Write entries, then roll explicitly so the next active memTable starts in a new WAL file. long time = 0; IMemTable memTable = new PrimitiveMemTable(databasePath, dataRegionId); long memTableId = memTable.getMemTableId(); @@ -352,7 +352,7 @@ public void testDeleteOutdatedFiles() throws Exception { .setDataRegion( new DataRegionId(1), new DataRegionTest.DummyDataRegion(logDirectory, databasePath)); walNode.onMemTableCreated(memTable, tsFilePath); - while (time < 20000) { + while (time < 10) { ++time; InsertTabletNode insertTabletNode = getInsertTabletNode(devicePath + memTableId, new long[] {time}); @@ -364,6 +364,7 @@ public void testDeleteOutdatedFiles() throws Exception { walFlushListeners.add(walFlushListener); } walNode.onMemTableFlushed(memTable); + walNode.rollWALFile(); walNode.onMemTableCreated(new PrimitiveMemTable(databasePath, dataRegionId), tsFilePath); Awaitility.await().until(() -> walNode.isAllWALEntriesConsumed()); // check existence of _0-0-0.wal file and _1-0-1.wal file diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorServiceLogTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorServiceLogTest.java new file mode 100644 index 0000000000000..f3256f314c6f9 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/load/active/ActiveLoadScheduledExecutorServiceLogTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.load.active; + +import org.apache.iotdb.commons.concurrent.WrappedRunnable; +import org.apache.iotdb.db.i18n.StorageEngineMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.atomic.AtomicBoolean; + +public class ActiveLoadScheduledExecutorServiceLogTest { + + @Test + public void activeLoadJobFailureLoggedOnlyOnceUntilSuccess() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) + LoggerFactory.getLogger(ActiveLoadScheduledExecutorService.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + AtomicBoolean shouldFail = new AtomicBoolean(true); + WrappedRunnable job = + ActiveLoadScheduledExecutorService.wrapRunnableWithFailureLogThrottle( + () -> { + if (shouldFail.get()) { + throw new RuntimeException("active load job failure"); + } + }); + + try { + job.run(); + job.run(); + Assert.assertEquals( + 1, countLogEvents(appender, StorageEngineMessages.ERROR_EXECUTING_ACTIVE_LOAD_JOB)); + + shouldFail.set(false); + job.run(); + + shouldFail.set(true); + job.run(); + Assert.assertEquals( + 2, countLogEvents(appender, StorageEngineMessages.ERROR_EXECUTING_ACTIVE_LOAD_JOB)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManager.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManager.java index cb8ea23a8216b..83511a7251c04 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManager.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManager.java @@ -30,11 +30,12 @@ import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.Map; -import java.util.Objects; import java.util.Scanner; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; /** @@ -54,14 +55,17 @@ public class LinuxDiskMetricsManager implements IDiskMetricsManager { private static final Logger LOGGER = LoggerFactory.getLogger(LinuxDiskMetricsManager.class); @SuppressWarnings("squid:S1075") - private static final String DISK_STATUS_FILE_PATH = "/proc/diskstats"; + private static final String DEFAULT_DISK_STATUS_FILE_PATH = "/proc/diskstats"; @SuppressWarnings("squid:S1075") - private static final String DISK_ID_PATH = "/sys/block"; + private static final String DEFAULT_DISK_ID_PATH = "/sys/block"; @SuppressWarnings("squid:S1075") - private static final String DISK_SECTOR_SIZE_PATH = "/sys/block/%s/queue/hw_sector_size"; + private static final String DEFAULT_DISK_SECTOR_SIZE_PATH = "/sys/block/%s/queue/hw_sector_size"; + private final String diskStatusFilePath; + private final String diskIdPath; + private final String diskSectorSizePath; private final String processIoStatusPath; private static final int DISK_ID_OFFSET = 3; private static final int DISK_READ_COUNT_OFFSET = 4; @@ -77,10 +81,13 @@ public class LinuxDiskMetricsManager implements IDiskMetricsManager { private static final int DEFAULT_SECTOR_SIZE = 512; private static final double BYTES_PER_KB = 1024.0; private static final long UPDATE_SMALLEST_INTERVAL = 10000L; + private static final long FAILURE_LOG_INTERVAL = TimeUnit.MINUTES.toMillis(5); private Set diskIdSet; private final Map diskSectorSizeMap; private long lastUpdateTime = 0L; private long updateInterval = 1L; + private final FailureLogState diskInfoFailureLogState = new FailureLogState(); + private final FailureLogState processInfoFailureLogState = new FailureLogState(); // Disk IO status structure private final Map lastReadOperationCountForDisk; @@ -113,9 +120,23 @@ public class LinuxDiskMetricsManager implements IDiskMetricsManager { private long lastWriteOpsCountForProcess = 0L; public LinuxDiskMetricsManager() { - processIoStatusPath = + this( + DEFAULT_DISK_STATUS_FILE_PATH, + DEFAULT_DISK_ID_PATH, + DEFAULT_DISK_SECTOR_SIZE_PATH, String.format( - "/proc/%s/io", MetricConfigDescriptor.getInstance().getMetricConfig().getPid()); + "/proc/%s/io", MetricConfigDescriptor.getInstance().getMetricConfig().getPid())); + } + + LinuxDiskMetricsManager( + String diskStatusFilePath, + String diskIdPath, + String diskSectorSizePath, + String processIoStatusPath) { + this.diskStatusFilePath = diskStatusFilePath; + this.diskIdPath = diskIdPath; + this.diskSectorSizePath = diskSectorSizePath; + this.processIoStatusPath = processIoStatusPath; collectDiskId(); // leave one entry to avoid hashmap resizing diskSectorSizeMap = new HashMap<>(diskIdSet.size() + 1, 1); @@ -324,12 +345,18 @@ public Set getDiskIds() { } private void collectDiskId() { - File diskIdFolder = new File(DISK_ID_PATH); + File diskIdFolder = new File(diskIdPath); if (!diskIdFolder.exists()) { + diskIdSet = Collections.emptySet(); + return; + } + File[] diskIdFiles = diskIdFolder.listFiles(); + if (diskIdFiles == null) { + diskIdSet = Collections.emptySet(); return; } diskIdSet = - new ArrayList<>(Arrays.asList(Objects.requireNonNull(diskIdFolder.listFiles()))) + new ArrayList<>(Arrays.asList(diskIdFiles)) .stream() .filter(x -> !x.getName().startsWith("loop") && !x.getName().startsWith("ram")) .map(File::getName) @@ -338,8 +365,8 @@ private void collectDiskId() { private void collectDiskInfo() { for (String diskId : diskIdSet) { - String diskSectorSizePath = String.format(DISK_SECTOR_SIZE_PATH, diskId); - File diskSectorSizeFile = new File(diskSectorSizePath); + String currentDiskSectorSizePath = String.format(diskSectorSizePath, diskId); + File diskSectorSizeFile = new File(currentDiskSectorSizePath); try (Scanner scanner = new Scanner(Files.newInputStream(diskSectorSizeFile.toPath()))) { if (scanner.hasNext()) { int sectorSize = Integer.parseInt(scanner.nextLine()); @@ -365,9 +392,9 @@ private void updateInfo() { } private void updateDiskInfo() { - File diskStatsFile = new File(DISK_STATUS_FILE_PATH); + File diskStatsFile = new File(diskStatusFilePath); if (!diskStatsFile.exists()) { - LOGGER.warn(MetricsMessages.CANNOT_FIND_DISK_IO_STATUS_FILE, DISK_STATUS_FILE_PATH); + logCannotFindDiskIoStatusFileIfNecessary(); return; } @@ -419,8 +446,9 @@ private void updateDiskInfo() { diskId, diskInfo, offsetArray[index], lastMapArray[index], incrementMapArray[index]); } } + clearFailureLogState(diskInfoFailureLogState); } catch (IOException e) { - LOGGER.error(MetricsMessages.ERROR_UPDATING_DISK_IO_INFO, e); + logErrorUpdatingDiskIoInfoIfNecessary(e); } } @@ -445,7 +473,8 @@ private void updateSingleDiskInfo( private void updateProcessInfo() { File processStatInfoFile = new File(processIoStatusPath); if (!processStatInfoFile.exists()) { - LOGGER.warn(MetricsMessages.CANNOT_FIND_PROCESS_IO_STATUS_FILE, processIoStatusPath); + logCannotFindProcessIoStatusFileIfNecessary(); + return; } try (Scanner processStatsScanner = @@ -466,11 +495,66 @@ private void updateProcessInfo() { lastAttemptWriteSizeForProcess = Long.parseLong(infoLine.split(":\\s")[1]); } } + clearFailureLogState(processInfoFailureLogState); } catch (IOException e) { + logErrorUpdatingProcessIoInfoIfNecessary(e); + } + } + + private void logCannotFindDiskIoStatusFileIfNecessary() { + if (shouldLogFailure( + diskInfoFailureLogState, + MetricsMessages.CANNOT_FIND_DISK_IO_STATUS_FILE + diskStatusFilePath)) { + LOGGER.warn(MetricsMessages.CANNOT_FIND_DISK_IO_STATUS_FILE, diskStatusFilePath); + } + } + + private void logErrorUpdatingDiskIoInfoIfNecessary(IOException e) { + if (shouldLogFailure(diskInfoFailureLogState, MetricsMessages.ERROR_UPDATING_DISK_IO_INFO)) { + LOGGER.error(MetricsMessages.ERROR_UPDATING_DISK_IO_INFO, e); + } + } + + private void logCannotFindProcessIoStatusFileIfNecessary() { + if (shouldLogFailure( + processInfoFailureLogState, + MetricsMessages.CANNOT_FIND_PROCESS_IO_STATUS_FILE + processIoStatusPath)) { + LOGGER.warn(MetricsMessages.CANNOT_FIND_PROCESS_IO_STATUS_FILE, processIoStatusPath); + } + } + + private void logErrorUpdatingProcessIoInfoIfNecessary(IOException e) { + if (shouldLogFailure( + processInfoFailureLogState, MetricsMessages.ERROR_UPDATING_PROCESS_IO_INFO)) { LOGGER.error(MetricsMessages.ERROR_UPDATING_PROCESS_IO_INFO, e); } } + private static boolean shouldLogFailure(FailureLogState failureLogState, String failureMessage) { + synchronized (failureLogState) { + long currentTime = System.currentTimeMillis(); + if (!failureMessage.equals(failureLogState.lastFailure) + || currentTime >= failureLogState.nextLogTime) { + failureLogState.lastFailure = failureMessage; + failureLogState.nextLogTime = currentTime + FAILURE_LOG_INTERVAL; + return true; + } + return false; + } + } + + private static void clearFailureLogState(FailureLogState failureLogState) { + synchronized (failureLogState) { + failureLogState.lastFailure = ""; + failureLogState.nextLogTime = 0L; + } + } + + private static class FailureLogState { + private long nextLogTime = 0L; + private String lastFailure = ""; + } + private void checkUpdate() { if (System.currentTimeMillis() - lastUpdateTime > UPDATE_SMALLEST_INTERVAL) { updateInfo(); diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManager.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManager.java index 4e717190eb761..90c58ebf0fc84 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManager.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManager.java @@ -93,6 +93,12 @@ public class WindowsDiskMetricsManager implements IDiskMetricsManager { private long nextPowerShellRetryTime = 0L; private long nextFailureLogTime = 0L; private String lastPowerShellFailure = ""; + private final FailureLogState diskFormatFailureLogState = new FailureLogState(); + private final FailureLogState processFormatFailureLogState = new FailureLogState(); + private final FailureLogState longParseFailureLogState = new FailureLogState(); + private final FailureLogState doubleParseFailureLogState = new FailureLogState(); + private boolean hasLongParseFailureInCurrentUpdate = false; + private boolean hasDoubleParseFailureInCurrentUpdate = false; private final Map lastReadOperationCountForDisk = new HashMap<>(); private final Map lastWriteOperationCountForDisk = new HashMap<>(); @@ -265,8 +271,16 @@ private void updateInfo() { long currentTime = System.currentTimeMillis(); updateInterval = lastUpdateTime == 0L ? 0L : currentTime - lastUpdateTime; lastUpdateTime = currentTime; + hasLongParseFailureInCurrentUpdate = false; + hasDoubleParseFailureInCurrentUpdate = false; updateDiskInfo(); updateProcessInfo(); + if (!hasLongParseFailureInCurrentUpdate) { + clearFailureLogState(longParseFailureLogState); + } + if (!hasDoubleParseFailureInCurrentUpdate) { + clearFailureLogState(doubleParseFailureLogState); + } } private void updateDiskInfo() { @@ -358,9 +372,10 @@ private void updateProcessInfo() { String[] processMetricArray = processInfo.split("\t"); if (processMetricArray.length < 4) { - LOGGER.warn(MetricsMessages.UNEXPECTED_WINDOWS_PROCESS_IO_FORMAT, processInfo); + logUnexpectedWindowsProcessIoFormatIfNecessary(processInfo); return; } + clearFailureLogState(processFormatFailureLogState); long readOpsPerSec = parseLong(processMetricArray[0]); long writeOpsPerSec = parseLong(processMetricArray[1]); @@ -383,13 +398,15 @@ private void updateProcessInfo() { private Map queryDiskInfo() { Map result = new HashMap<>(); + boolean hasUnexpectedFormat = false; for (String line : executePowerShell(DISK_QUERY)) { if (line == null || line.isEmpty()) { continue; } String[] values = line.split("\t"); if (values.length < 9) { - LOGGER.warn(MetricsMessages.UNEXPECTED_WINDOWS_DISK_IO_FORMAT, line); + hasUnexpectedFormat = true; + logUnexpectedWindowsDiskIoFormatIfNecessary(line); continue; } String diskId = values[0].trim(); @@ -400,6 +417,9 @@ private Map queryDiskInfo() { System.arraycopy(values, 1, metricArray, 0, metricArray.length); result.put(diskId, metricArray); } + if (!hasUnexpectedFormat) { + clearFailureLogState(diskFormatFailureLogState); + } return result; } @@ -446,7 +466,8 @@ private long parseLong(String value) { try { return Math.round(Double.parseDouble(value.trim())); } catch (NumberFormatException e) { - LOGGER.warn(MetricsMessages.FAILED_TO_PARSE_LONG_WINDOWS_DISK, value, e); + hasLongParseFailureInCurrentUpdate = true; + logFailedToParseLongWindowsDiskIfNecessary(value, e); return 0L; } } @@ -455,7 +476,8 @@ private double parseDouble(String value) { try { return Double.parseDouble(value.trim()); } catch (NumberFormatException e) { - LOGGER.warn(MetricsMessages.FAILED_TO_PARSE_DOUBLE_WINDOWS_DISK, value, e); + hasDoubleParseFailureInCurrentUpdate = true; + logFailedToParseDoubleWindowsDiskIfNecessary(value, e); return 0.0; } } @@ -548,6 +570,34 @@ private void handlePowerShellFailure( } } + private void logUnexpectedWindowsDiskIoFormatIfNecessary(String line) { + if (shouldLogFailure( + diskFormatFailureLogState, MetricsMessages.UNEXPECTED_WINDOWS_DISK_IO_FORMAT)) { + LOGGER.warn(MetricsMessages.UNEXPECTED_WINDOWS_DISK_IO_FORMAT, line); + } + } + + private void logUnexpectedWindowsProcessIoFormatIfNecessary(String processInfo) { + if (shouldLogFailure( + processFormatFailureLogState, MetricsMessages.UNEXPECTED_WINDOWS_PROCESS_IO_FORMAT)) { + LOGGER.warn(MetricsMessages.UNEXPECTED_WINDOWS_PROCESS_IO_FORMAT, processInfo); + } + } + + private void logFailedToParseLongWindowsDiskIfNecessary(String value, NumberFormatException e) { + if (shouldLogFailure( + longParseFailureLogState, MetricsMessages.FAILED_TO_PARSE_LONG_WINDOWS_DISK)) { + LOGGER.warn(MetricsMessages.FAILED_TO_PARSE_LONG_WINDOWS_DISK, value, e); + } + } + + private void logFailedToParseDoubleWindowsDiskIfNecessary(String value, NumberFormatException e) { + if (shouldLogFailure( + doubleParseFailureLogState, MetricsMessages.FAILED_TO_PARSE_DOUBLE_WINDOWS_DISK)) { + LOGGER.warn(MetricsMessages.FAILED_TO_PARSE_DOUBLE_WINDOWS_DISK, value, e); + } + } + private boolean shouldLogFailure(long currentTime, String failureMessage) { if (!failureMessage.equals(lastPowerShellFailure) || currentTime >= nextFailureLogTime) { lastPowerShellFailure = failureMessage; @@ -557,6 +607,26 @@ private boolean shouldLogFailure(long currentTime, String failureMessage) { return false; } + private static boolean shouldLogFailure(FailureLogState failureLogState, String failureMessage) { + synchronized (failureLogState) { + long currentTime = System.currentTimeMillis(); + if (!failureMessage.equals(failureLogState.lastFailure) + || currentTime >= failureLogState.nextLogTime) { + failureLogState.lastFailure = failureMessage; + failureLogState.nextLogTime = currentTime + FAILURE_LOG_INTERVAL; + return true; + } + return false; + } + } + + private static void clearFailureLogState(FailureLogState failureLogState) { + synchronized (failureLogState) { + failureLogState.lastFailure = ""; + failureLogState.nextLogTime = 0L; + } + } + private void clearPowerShellFailure() { if (!lastPowerShellFailure.isEmpty() || nextPowerShellRetryTime > 0L) { LOGGER.info("Recovered windows disk metrics collection through PowerShell/CIM."); @@ -583,6 +653,11 @@ private static Charset getWindowsShellCharset() { return Charset.defaultCharset(); } + private static class FailureLogState { + private long nextLogTime = 0L; + private String lastFailure = ""; + } + private static String resolvePowerShellExecutable() { String systemRoot = System.getenv("SystemRoot"); if (systemRoot == null || systemRoot.isEmpty()) { diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/LinuxNetMetricManager.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/LinuxNetMetricManager.java index 77e950e704d29..3cc9ee3df5976 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/LinuxNetMetricManager.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/LinuxNetMetricManager.java @@ -42,6 +42,7 @@ import java.util.Objects; import java.util.Scanner; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; public class LinuxNetMetricManager implements INetMetricManager { @@ -55,6 +56,7 @@ public class LinuxNetMetricManager implements INetMetricManager { private static final String BYTES = "bytes"; private static final String PACKETS = "packets"; + private static final long FAILURE_LOG_INTERVAL = TimeUnit.MINUTES.toMillis(5); private static final int IFACE_NAME_INDEX = 0; @@ -91,6 +93,8 @@ public LinuxNetMetricManager() { } private long lastUpdateTime = 0L; + private final FailureLogState netStatusFailureLogState = new FailureLogState(); + private final FailureLogState socketNumFailureLogState = new FailureLogState(); @Override public Set getIfaceSet() { @@ -213,8 +217,9 @@ private void updateNetStatus() { long transmittedPackets = Long.parseLong(statusInfoAsList.get(transmittedPacketsIndex)); transmittedPacketsMapForIface.put(iface, transmittedPackets); } + clearFailureLogState(netStatusFailureLogState); } catch (IOException e) { - LOGGER.error(MetricsMessages.READ_NET_STATUS_FOR_NET_ERROR, NET_STATUS_PATH, e); + logReadNetStatusForNetErrorIfNecessary(e); } if (MetricLevel.higherOrEqual(MetricLevel.NORMAL, METRIC_CONFIG.getMetricLevel())) { @@ -232,13 +237,14 @@ private void updateNetStatus() { } process.waitFor(); this.connectionNum = Integer.parseInt(result.toString().trim()); + clearFailureLogState(socketNumFailureLogState); } catch (IOException e) { - LOGGER.error(MetricsMessages.FAILED_TO_GET_SOCKET_NUM, e); + logFailedToGetSocketNumIfNecessary(e); } catch (InterruptedException e) { - LOGGER.error(MetricsMessages.INTERRUPTED_WHILE_WAITING_SOCKET_NUM, e); + logInterruptedWhileWaitingSocketNumIfNecessary(e); Thread.currentThread().interrupt(); } catch (NumberFormatException e) { - LOGGER.error(MetricsMessages.FAILED_TO_PARSE_SOCKET_NUM, e.getMessage()); + logFailedToParseSocketNumIfNecessary(e); } finally { if (process != null && process.isAlive()) { process.destroyForcibly(); @@ -246,4 +252,55 @@ private void updateNetStatus() { } } } + + private void logReadNetStatusForNetErrorIfNecessary(IOException e) { + if (shouldLogFailure(netStatusFailureLogState, MetricsMessages.READ_NET_STATUS_FOR_NET_ERROR)) { + LOGGER.error(MetricsMessages.READ_NET_STATUS_FOR_NET_ERROR, NET_STATUS_PATH, e); + } + } + + private void logFailedToGetSocketNumIfNecessary(IOException e) { + if (shouldLogFailure(socketNumFailureLogState, MetricsMessages.FAILED_TO_GET_SOCKET_NUM)) { + LOGGER.error(MetricsMessages.FAILED_TO_GET_SOCKET_NUM, e); + } + } + + private void logInterruptedWhileWaitingSocketNumIfNecessary(InterruptedException e) { + if (shouldLogFailure( + socketNumFailureLogState, MetricsMessages.INTERRUPTED_WHILE_WAITING_SOCKET_NUM)) { + LOGGER.error(MetricsMessages.INTERRUPTED_WHILE_WAITING_SOCKET_NUM, e); + } + } + + private void logFailedToParseSocketNumIfNecessary(NumberFormatException e) { + if (shouldLogFailure( + socketNumFailureLogState, MetricsMessages.FAILED_TO_PARSE_SOCKET_NUM + e.getMessage())) { + LOGGER.error(MetricsMessages.FAILED_TO_PARSE_SOCKET_NUM, e.getMessage()); + } + } + + private static boolean shouldLogFailure(FailureLogState failureLogState, String failureMessage) { + synchronized (failureLogState) { + long currentTime = System.currentTimeMillis(); + if (!failureMessage.equals(failureLogState.lastFailure) + || currentTime >= failureLogState.nextLogTime) { + failureLogState.lastFailure = failureMessage; + failureLogState.nextLogTime = currentTime + FAILURE_LOG_INTERVAL; + return true; + } + return false; + } + } + + private static void clearFailureLogState(FailureLogState failureLogState) { + synchronized (failureLogState) { + failureLogState.lastFailure = ""; + failureLogState.nextLogTime = 0L; + } + } + + private static class FailureLogState { + private long nextLogTime = 0L; + private String lastFailure = ""; + } } diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/WindowsNetMetricManager.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/WindowsNetMetricManager.java index 21ed108ebde50..0a4addc844f33 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/WindowsNetMetricManager.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/net/WindowsNetMetricManager.java @@ -36,14 +36,19 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; public class WindowsNetMetricManager implements INetMetricManager { private static final Logger LOGGER = LoggerFactory.getLogger(WindowsNetMetricManager.class); + private static final long FAILURE_LOG_INTERVAL = TimeUnit.MINUTES.toMillis(5); private static final MetricConfig METRIC_CONFIG = MetricConfigDescriptor.getInstance().getMetricConfig(); private long lastUpdateTime = 0L; + private final FailureLogState interfacesFailureLogState = new FailureLogState(); + private final FailureLogState statisticsFailureLogState = new FailureLogState(); + private final FailureLogState connectionNumFailureLogState = new FailureLogState(); private Set ifaceSet = new HashSet<>(); @@ -131,13 +136,15 @@ private void updateInterfaces() { } int exitCode = process.waitFor(); if (exitCode != 0) { - LOGGER.error(MetricsMessages.GET_INTERFACES_FAILED, exitCode); + logGetInterfacesFailedIfNecessary(exitCode); + } else { + clearFailureLogState(interfacesFailureLogState); } } catch (IOException | InterruptedException e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } - LOGGER.error(MetricsMessages.UPDATE_INTERFACES_ERROR, e); + logUpdateInterfacesErrorIfNecessary(e); ifaceSet.clear(); } } @@ -178,13 +185,15 @@ private void updateStatistics() { } int exitCode = process.waitFor(); if (exitCode != 0) { - LOGGER.error(MetricsMessages.GET_STATISTICS_FAILED, exitCode); + logGetStatisticsFailedIfNecessary(exitCode); + } else { + clearFailureLogState(statisticsFailureLogState); } } catch (IOException | InterruptedException | NumberFormatException e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } - LOGGER.error(MetricsMessages.UPDATE_STATISTICS_ERROR, e); + logUpdateStatisticsErrorIfNecessary(e); } } @@ -206,13 +215,80 @@ private void updateConnectionNum() { this.connectionNum = count; int exitCode = process.waitFor(); if (exitCode != 0) { - LOGGER.error(MetricsMessages.GET_CONNECTION_NUM_FAILED, exitCode); + logGetConnectionNumFailedIfNecessary(exitCode); + } else { + clearFailureLogState(connectionNumFailureLogState); } } catch (IOException | InterruptedException e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } + logUpdateConnectionNumErrorIfNecessary(e); + } + } + + private void logGetInterfacesFailedIfNecessary(int exitCode) { + if (shouldLogFailure( + interfacesFailureLogState, MetricsMessages.GET_INTERFACES_FAILED + exitCode)) { + LOGGER.error(MetricsMessages.GET_INTERFACES_FAILED, exitCode); + } + } + + private void logUpdateInterfacesErrorIfNecessary(Exception e) { + if (shouldLogFailure(interfacesFailureLogState, MetricsMessages.UPDATE_INTERFACES_ERROR)) { + LOGGER.error(MetricsMessages.UPDATE_INTERFACES_ERROR, e); + } + } + + private void logGetStatisticsFailedIfNecessary(int exitCode) { + if (shouldLogFailure( + statisticsFailureLogState, MetricsMessages.GET_STATISTICS_FAILED + exitCode)) { + LOGGER.error(MetricsMessages.GET_STATISTICS_FAILED, exitCode); + } + } + + private void logUpdateStatisticsErrorIfNecessary(Exception e) { + if (shouldLogFailure(statisticsFailureLogState, MetricsMessages.UPDATE_STATISTICS_ERROR)) { + LOGGER.error(MetricsMessages.UPDATE_STATISTICS_ERROR, e); + } + } + + private void logGetConnectionNumFailedIfNecessary(int exitCode) { + if (shouldLogFailure( + connectionNumFailureLogState, MetricsMessages.GET_CONNECTION_NUM_FAILED + exitCode)) { + LOGGER.error(MetricsMessages.GET_CONNECTION_NUM_FAILED, exitCode); + } + } + + private void logUpdateConnectionNumErrorIfNecessary(Exception e) { + if (shouldLogFailure( + connectionNumFailureLogState, MetricsMessages.UPDATE_CONNECTION_NUM_ERROR)) { LOGGER.error(MetricsMessages.UPDATE_CONNECTION_NUM_ERROR, e); } } + + private static boolean shouldLogFailure(FailureLogState failureLogState, String failureMessage) { + synchronized (failureLogState) { + long currentTime = System.currentTimeMillis(); + if (!failureMessage.equals(failureLogState.lastFailure) + || currentTime >= failureLogState.nextLogTime) { + failureLogState.lastFailure = failureMessage; + failureLogState.nextLogTime = currentTime + FAILURE_LOG_INTERVAL; + return true; + } + return false; + } + } + + private static void clearFailureLogState(FailureLogState failureLogState) { + synchronized (failureLogState) { + failureLogState.lastFailure = ""; + failureLogState.nextLogTime = 0L; + } + } + + private static class FailureLogState { + private long nextLogTime = 0L; + private String lastFailure = ""; + } } diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java index e14d8910791a1..bd2022ad0ba79 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/metricsets/system/SystemMetrics.java @@ -44,10 +44,15 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; public class SystemMetrics implements IMetricSet { private static final Logger logger = LoggerFactory.getLogger(SystemMetrics.class); private static final MetricConfig CONFIG = MetricConfigDescriptor.getInstance().getMetricConfig(); + private static final long FAILURE_LOG_INTERVAL = TimeUnit.MINUTES.toMillis(5); private final Runtime runtime = Runtime.getRuntime(); private final String[] getSystemMemoryCommand = new String[] {"/bin/sh", "-c", "free"}; private final String[] linuxMemoryTitles = @@ -62,6 +67,8 @@ public class SystemMetrics implements IMetricSet { private final com.sun.management.OperatingSystemMXBean osMxBean; private Set fileStores = new HashSet<>(); private static final String FAILED_TO_STATISTIC = "Failed to statistic the size of {}, because"; + private final ConcurrentMap fileStoreStatisticFailureLastLogTimeMap = + new ConcurrentHashMap<>(); public SystemMetrics() { this.osMxBean = (OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean(); @@ -338,8 +345,9 @@ public long getSystemDiskTotalSpace() { for (FileStore fileStore : fileStores) { try { sysTotalSpace += fileStore.getTotalSpace(); + resetFileStoreStatisticFailureLogTime(fileStore, SystemMetric.SYS_DISK_TOTAL_SPACE); } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); + logFileStoreStatisticFailureIfNecessary(fileStore, SystemMetric.SYS_DISK_TOTAL_SPACE, e); } } return sysTotalSpace; @@ -350,8 +358,9 @@ public long getSystemDiskFreeSpace() { for (FileStore fileStore : fileStores) { try { sysFreeSpace += fileStore.getUnallocatedSpace(); + resetFileStoreStatisticFailureLogTime(fileStore, SystemMetric.SYS_DISK_FREE_SPACE); } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); + logFileStoreStatisticFailureIfNecessary(fileStore, SystemMetric.SYS_DISK_FREE_SPACE, e); } } return sysFreeSpace; @@ -362,13 +371,38 @@ public long getSystemDiskAvailableSpace() { for (FileStore fileStore : fileStores) { try { sysAvailableSpace += fileStore.getUsableSpace(); + resetFileStoreStatisticFailureLogTime(fileStore, SystemMetric.SYS_DISK_AVAILABLE_SPACE); } catch (IOException e) { - logger.error(FAILED_TO_STATISTIC, fileStore, e); + logFileStoreStatisticFailureIfNecessary( + fileStore, SystemMetric.SYS_DISK_AVAILABLE_SPACE, e); } } return sysAvailableSpace; } + private void logFileStoreStatisticFailureIfNecessary( + FileStore fileStore, SystemMetric systemMetric, IOException e) { + AtomicLong lastLogTime = + fileStoreStatisticFailureLastLogTimeMap.computeIfAbsent( + getFileStoreStatisticFailureKey(fileStore, systemMetric), key -> new AtomicLong(0L)); + long currentTime = System.currentTimeMillis(); + long previousLogTime = lastLogTime.get(); + if ((previousLogTime == 0L || currentTime - previousLogTime >= FAILURE_LOG_INTERVAL) + && lastLogTime.compareAndSet(previousLogTime, currentTime)) { + logger.error(FAILED_TO_STATISTIC, fileStore, e); + } + } + + private void resetFileStoreStatisticFailureLogTime( + FileStore fileStore, SystemMetric systemMetric) { + fileStoreStatisticFailureLastLogTimeMap.remove( + getFileStoreStatisticFailureKey(fileStore, systemMetric)); + } + + private String getFileStoreStatisticFailureKey(FileStore fileStore, SystemMetric systemMetric) { + return systemMetric + ":" + fileStore; + } + public static SystemMetrics getInstance() { return SystemMetricsHolder.INSTANCE; } diff --git a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporter.java b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporter.java index 8e73776e53796..e12c840f4231a 100644 --- a/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporter.java +++ b/iotdb-core/metrics/interface/src/main/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporter.java @@ -48,12 +48,15 @@ public class IoTDBSessionReporter extends IoTDBReporter { private static final Logger LOGGER = LoggerFactory.getLogger(IoTDBSessionReporter.class); + private static final long FAILURE_LOG_INTERVAL = TimeUnit.MINUTES.toMillis(5); private static final MetricConfig metricConfig = MetricConfigDescriptor.getInstance().getMetricConfig(); private static final IoTDBReporterConfig ioTDBReporterConfig = MetricConfigDescriptor.getInstance().getMetricConfig().getIoTDBReporterConfig(); private Future currentServiceFuture; private final ScheduledExecutorService service = Executors.newSingleThreadScheduledExecutor(); + private final FailureLogState asyncReportFailureLogState = new FailureLogState(); + private final FailureLogState insertFailureLogState = new FailureLogState(); /** The manager of metrics. */ protected AbstractMetricManager metricManager; @@ -119,8 +122,9 @@ public boolean start() { values.put(prefix, value); } writeMetricsToIoTDB(values, System.currentTimeMillis()); + clearFailureLogState(asyncReportFailureLogState); } catch (Throwable t) { - LOGGER.error(MetricsMessages.IOTDB_SESSION_REPORTER_START_FAILED, t); + logAsyncReportFailureIfNecessary(t); } }, 1, @@ -164,8 +168,9 @@ protected void writeMetricToIoTDB(Map valueMap, String prefix, l try { sessionPool.insertRecord(prefix, time, sensors, dataTypes, values); + clearFailureLogState(insertFailureLogState); } catch (IoTDBConnectionException | StatementExecutionException e) { - LOGGER.warn(MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED, e); + logInsertFailureIfNecessary(e); } } @@ -195,8 +200,48 @@ protected void writeMetricsToIoTDB(Map> valueMap, lo try { sessionPool.insertRecords(deviceIds, times, sensors, dataTypes, values); + clearFailureLogState(insertFailureLogState); } catch (IoTDBConnectionException | StatementExecutionException e) { + logInsertFailureIfNecessary(e); + } + } + + private void logAsyncReportFailureIfNecessary(Throwable t) { + if (shouldLogFailure( + asyncReportFailureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_START_FAILED)) { + LOGGER.error(MetricsMessages.IOTDB_SESSION_REPORTER_START_FAILED, t); + } + } + + private void logInsertFailureIfNecessary(Exception e) { + if (shouldLogFailure( + insertFailureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED)) { LOGGER.warn(MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED, e); } } + + static boolean shouldLogFailure(FailureLogState failureLogState, String failureMessage) { + synchronized (failureLogState) { + long currentTime = System.currentTimeMillis(); + if (!failureMessage.equals(failureLogState.lastFailure) + || currentTime >= failureLogState.nextLogTime) { + failureLogState.lastFailure = failureMessage; + failureLogState.nextLogTime = currentTime + FAILURE_LOG_INTERVAL; + return true; + } + return false; + } + } + + static void clearFailureLogState(FailureLogState failureLogState) { + synchronized (failureLogState) { + failureLogState.lastFailure = ""; + failureLogState.nextLogTime = 0L; + } + } + + static class FailureLogState { + private long nextLogTime = 0L; + private String lastFailure = ""; + } } diff --git a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManagerTest.java b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManagerTest.java new file mode 100644 index 0000000000000..8ff305bda99fc --- /dev/null +++ b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/LinuxDiskMetricsManagerTest.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.metrics.metricsets.disk; + +import org.apache.iotdb.metrics.i18n.MetricsMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.lang.reflect.Method; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.Arrays; +import java.util.Collections; + +public class LinuxDiskMetricsManagerTest { + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + @Test + public void diskInfoFailureLoggedOnlyOnceUntilRecovery() throws Exception { + File diskIdFolder = createDiskIdFolder(); + File processIoFile = tempFolder.newFile("process-io"); + writeProcessIoFile(processIoFile); + File diskStatsFile = new File(tempFolder.getRoot(), "diskstats"); + + LinuxDiskMetricsManager manager = + new LinuxDiskMetricsManager( + diskStatsFile.getAbsolutePath(), + diskIdFolder.getAbsolutePath(), + getDiskSectorSizePath(diskIdFolder), + processIoFile.getAbsolutePath()); + + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(LinuxDiskMetricsManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + invokeUpdateInfo(manager); + invokeUpdateInfo(manager); + Assert.assertEquals( + 1, countLogEvents(appender, MetricsMessages.CANNOT_FIND_DISK_IO_STATUS_FILE)); + + writeDiskStatsFile(diskStatsFile); + invokeUpdateInfo(manager); + + Files.delete(diskStatsFile.toPath()); + invokeUpdateInfo(manager); + Assert.assertEquals( + 2, countLogEvents(appender, MetricsMessages.CANNOT_FIND_DISK_IO_STATUS_FILE)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + @Test + public void missingProcessInfoLogsCannotFindOnlyOnceWithoutUpdateError() throws Exception { + File diskIdFolder = createDiskIdFolder(); + File diskStatsFile = tempFolder.newFile("diskstats"); + writeDiskStatsFile(diskStatsFile); + File processIoFile = new File(tempFolder.getRoot(), "missing-process-io"); + + LinuxDiskMetricsManager manager = + new LinuxDiskMetricsManager( + diskStatsFile.getAbsolutePath(), + diskIdFolder.getAbsolutePath(), + getDiskSectorSizePath(diskIdFolder), + processIoFile.getAbsolutePath()); + + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(LinuxDiskMetricsManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + invokeUpdateInfo(manager); + invokeUpdateInfo(manager); + + Assert.assertEquals( + 1, countLogEvents(appender, MetricsMessages.CANNOT_FIND_PROCESS_IO_STATUS_FILE)); + Assert.assertEquals( + 0, countLogEvents(appender, MetricsMessages.ERROR_UPDATING_PROCESS_IO_INFO)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private File createDiskIdFolder() throws Exception { + File diskIdFolder = tempFolder.newFolder("block"); + File sectorSizeFolder = new File(diskIdFolder, "sda" + File.separator + "queue"); + Assert.assertTrue(sectorSizeFolder.mkdirs()); + Files.write( + new File(sectorSizeFolder, "hw_sector_size").toPath(), + Collections.singletonList("512"), + StandardCharsets.UTF_8); + return diskIdFolder; + } + + private String getDiskSectorSizePath(File diskIdFolder) { + return new File( + diskIdFolder, "%s" + File.separator + "queue" + File.separator + "hw_sector_size") + .getAbsolutePath(); + } + + private void writeDiskStatsFile(File diskStatsFile) throws Exception { + Files.write( + diskStatsFile.toPath(), + Collections.singletonList(" 8 0 sda 1 0 2 3 4 0 5 6 7 8 9"), + StandardCharsets.UTF_8); + } + + private void writeProcessIoFile(File processIoFile) throws Exception { + Files.write( + processIoFile.toPath(), + Arrays.asList( + "syscr: 1", "syscw: 2", "read_bytes: 3", "write_bytes: 4", "rchar: 5", "wchar: 6"), + StandardCharsets.UTF_8); + } + + private void invokeUpdateInfo(LinuxDiskMetricsManager manager) throws Exception { + Method updateInfo = LinuxDiskMetricsManager.class.getDeclaredMethod("updateInfo"); + updateInfo.setAccessible(true); + updateInfo.invoke(manager); + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManagerTest.java b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManagerTest.java index 3105e8c7bd1b3..6606ce75514fc 100644 --- a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManagerTest.java +++ b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/disk/WindowsDiskMetricsManagerTest.java @@ -19,9 +19,16 @@ package org.apache.iotdb.metrics.metricsets.disk; +import org.apache.iotdb.metrics.i18n.MetricsMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; import org.junit.Test; +import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.reflect.Method; import java.util.Arrays; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -76,4 +83,124 @@ public void testPowerShellFailureSkipsFollowingQueryDuringBackoff() { assertTrue(manager.getDiskIds().isEmpty()); assertEquals(1, executeCount.get()); } + + @Test + public void unexpectedFormatFailureLoggedOnlyOnceUntilRecovery() throws Exception { + AtomicInteger diskQueryRound = new AtomicInteger(); + WindowsDiskMetricsManager manager = + new WindowsDiskMetricsManager( + "123", + command -> { + if (command.contains("PhysicalDisk")) { + int round = diskQueryRound.incrementAndGet(); + return new WindowsDiskMetricsManager.CommandResult( + 0, + round == 3 + ? Arrays.asList("0 C:\t1\t2\t1024\t4096\t0.001\t0.002\t75\t3") + : Arrays.asList("bad-disk-line")); + } + if (command.contains("PerfProc_Process")) { + return new WindowsDiskMetricsManager.CommandResult( + 0, + diskQueryRound.get() == 3 + ? Arrays.asList("3\t4\t8192\t16384") + : Arrays.asList("bad-process-line")); + } + return new WindowsDiskMetricsManager.CommandResult(1, Arrays.asList("unexpected")); + }); + + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(WindowsDiskMetricsManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + invokeUpdateInfo(manager); + invokeUpdateInfo(manager); + + assertEquals(1, countLogEvents(appender, MetricsMessages.UNEXPECTED_WINDOWS_DISK_IO_FORMAT)); + assertEquals( + 1, countLogEvents(appender, MetricsMessages.UNEXPECTED_WINDOWS_PROCESS_IO_FORMAT)); + + invokeUpdateInfo(manager); + invokeUpdateInfo(manager); + + assertEquals(2, countLogEvents(appender, MetricsMessages.UNEXPECTED_WINDOWS_DISK_IO_FORMAT)); + assertEquals( + 2, countLogEvents(appender, MetricsMessages.UNEXPECTED_WINDOWS_PROCESS_IO_FORMAT)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + @Test + public void parseFailureLoggedOnlyOnceUntilRecovery() throws Exception { + AtomicInteger diskQueryRound = new AtomicInteger(); + WindowsDiskMetricsManager manager = + new WindowsDiskMetricsManager( + "123", + command -> { + if (command.contains("PhysicalDisk")) { + int round = diskQueryRound.incrementAndGet(); + return new WindowsDiskMetricsManager.CommandResult( + 0, + Arrays.asList( + round == 3 + ? "0 C:\t1\t2\t1024\t4096\t0.001\t0.002\t75\t3" + : "0 C:\tnot-long\t2\t1024\t4096\tnot-double\t0.002\t75\t3")); + } + if (command.contains("PerfProc_Process")) { + return new WindowsDiskMetricsManager.CommandResult( + 0, Arrays.asList("3\t4\t8192\t16384")); + } + return new WindowsDiskMetricsManager.CommandResult(1, Arrays.asList("unexpected")); + }); + + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(WindowsDiskMetricsManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + invokeUpdateInfo(manager); + invokeUpdateInfo(manager); + + assertEquals(1, countLogEvents(appender, MetricsMessages.FAILED_TO_PARSE_LONG_WINDOWS_DISK)); + assertEquals( + 1, countLogEvents(appender, MetricsMessages.FAILED_TO_PARSE_DOUBLE_WINDOWS_DISK)); + + invokeUpdateInfo(manager); + invokeUpdateInfo(manager); + + assertEquals(2, countLogEvents(appender, MetricsMessages.FAILED_TO_PARSE_LONG_WINDOWS_DISK)); + assertEquals( + 2, countLogEvents(appender, MetricsMessages.FAILED_TO_PARSE_DOUBLE_WINDOWS_DISK)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private void invokeUpdateInfo(WindowsDiskMetricsManager manager) throws Exception { + Method updateInfo = WindowsDiskMetricsManager.class.getDeclaredMethod("updateInfo"); + updateInfo.setAccessible(true); + updateInfo.invoke(manager); + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } } diff --git a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java new file mode 100644 index 0000000000000..3e637357c7c7a --- /dev/null +++ b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/metricsets/system/SystemMetricsTest.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.metrics.metricsets.system; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.nio.file.FileStore; +import java.nio.file.attribute.FileAttributeView; +import java.nio.file.attribute.FileStoreAttributeView; +import java.util.Collections; +import java.util.concurrent.atomic.AtomicBoolean; + +public class SystemMetricsTest { + + @Test + public void fileStoreFailureLoggedOnlyOnceUntilRecovery() throws Exception { + SystemMetrics systemMetrics = new SystemMetrics(); + RecoverableFileStore fileStore = new RecoverableFileStore("test-store"); + setFileStores(systemMetrics, fileStore); + + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(SystemMetrics.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + systemMetrics.getSystemDiskTotalSpace(); + systemMetrics.getSystemDiskTotalSpace(); + + Assert.assertEquals(1, appender.list.size()); + + fileStore.setRecovered(true); + Assert.assertEquals(100L, systemMetrics.getSystemDiskTotalSpace()); + + fileStore.setRecovered(false); + systemMetrics.getSystemDiskTotalSpace(); + + Assert.assertEquals(2, appender.list.size()); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private void setFileStores(SystemMetrics systemMetrics, FileStore fileStore) throws Exception { + Field fileStoresField = SystemMetrics.class.getDeclaredField("fileStores"); + fileStoresField.setAccessible(true); + fileStoresField.set(systemMetrics, Collections.singleton(fileStore)); + } + + private static class RecoverableFileStore extends FileStore { + private final String name; + private final AtomicBoolean recovered = new AtomicBoolean(false); + + private RecoverableFileStore(String name) { + this.name = name; + } + + private void setRecovered(boolean recovered) { + this.recovered.set(recovered); + } + + @Override + public String name() { + return name; + } + + @Override + public String type() { + return "test"; + } + + @Override + public boolean isReadOnly() { + return false; + } + + @Override + public long getTotalSpace() throws IOException { + return getSpace(); + } + + @Override + public long getUsableSpace() throws IOException { + return getSpace(); + } + + @Override + public long getUnallocatedSpace() throws IOException { + return getSpace(); + } + + @Override + public boolean supportsFileAttributeView(Class type) { + return false; + } + + @Override + public boolean supportsFileAttributeView(String name) { + return false; + } + + @Override + public V getFileStoreAttributeView(Class type) { + return null; + } + + @Override + public Object getAttribute(String attribute) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() { + return name; + } + + private long getSpace() throws IOException { + if (recovered.get()) { + return 100L; + } + throw new IOException("disk failure"); + } + } +} diff --git a/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporterTest.java b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporterTest.java new file mode 100644 index 0000000000000..0e3d0106c4d87 --- /dev/null +++ b/iotdb-core/metrics/interface/src/test/java/org/apache/iotdb/metrics/reporter/iotdb/IoTDBSessionReporterTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.metrics.reporter.iotdb; + +import org.apache.iotdb.metrics.i18n.MetricsMessages; + +import org.junit.Assert; +import org.junit.Test; + +public class IoTDBSessionReporterTest { + + @Test + public void failureLogStateSuppressesContinuousFailureUntilRecovery() { + IoTDBSessionReporter.FailureLogState failureLogState = + new IoTDBSessionReporter.FailureLogState(); + + Assert.assertTrue( + IoTDBSessionReporter.shouldLogFailure( + failureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED)); + Assert.assertFalse( + IoTDBSessionReporter.shouldLogFailure( + failureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED)); + + IoTDBSessionReporter.clearFailureLogState(failureLogState); + + Assert.assertTrue( + IoTDBSessionReporter.shouldLogFailure( + failureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED)); + } + + @Test + public void differentFailureMessageLogsImmediately() { + IoTDBSessionReporter.FailureLogState failureLogState = + new IoTDBSessionReporter.FailureLogState(); + + Assert.assertTrue( + IoTDBSessionReporter.shouldLogFailure( + failureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_INSERT_FAILED)); + Assert.assertTrue( + IoTDBSessionReporter.shouldLogFailure( + failureLogState, MetricsMessages.IOTDB_SESSION_REPORTER_START_FAILED)); + } +} diff --git a/iotdb-core/node-commons/pom.xml b/iotdb-core/node-commons/pom.xml index f0a1afcb8221c..2c3d4bc7ca571 100644 --- a/iotdb-core/node-commons/pom.xml +++ b/iotdb-core/node-commons/pom.xml @@ -286,6 +286,20 @@ + + org.apache.maven.plugins + maven-dependency-plugin + + + ch.qos.logback:logback-classic + ch.qos.logback:logback-core + + + ch.qos.logback:logback-classic + ch.qos.logback:logback-core + + + diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtil.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtil.java index 4f1d4a5970542..e6e6ecade0f88 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtil.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtil.java @@ -31,6 +31,7 @@ public class ScheduledExecutorUtil { private static final Logger logger = LoggerFactory.getLogger(ScheduledExecutorUtil.class); + static final long SCHEDULE_TASK_FAILED_LOG_INTERVAL_MS = TimeUnit.MINUTES.toMillis(5); /** * A safe wrapper method to make sure the exception thrown by the previous running will not affect @@ -150,19 +151,7 @@ private static ScheduledFuture scheduleAtFixedRate( TimeUnit unit, boolean unsafe) { return executor.scheduleAtFixedRate( - () -> { - try { - command.run(); - } catch (Throwable t) { - logger.error(CommonMessages.SCHEDULE_TASK_FAILED, t); - if (unsafe) { - throw t; - } - } - }, - initialDelay, - period, - unit); + wrapScheduleCommand(command, unsafe), initialDelay, period, unit); } @SuppressWarnings("unsafeThreadSchedule") @@ -174,19 +163,64 @@ private static ScheduledFuture scheduleWithFixedDelay( TimeUnit unit, boolean unsafe) { return executor.scheduleWithFixedDelay( - () -> { - try { - command.run(); - } catch (Throwable t) { - logger.error(CommonMessages.SCHEDULE_TASK_FAILED, t); - if (unsafe) { - throw t; - } - } - }, - initialDelay, - delay, - unit); + wrapScheduleCommand(command, unsafe), initialDelay, delay, unit); + } + + private static Runnable wrapScheduleCommand(Runnable command, boolean unsafe) { + FailureLogState failureLogState = new FailureLogState(); + return () -> { + try { + command.run(); + failureLogState.clear(); + } catch (Throwable t) { + if (shouldLogFailure(failureLogState, t, System.currentTimeMillis())) { + logger.error(CommonMessages.SCHEDULE_TASK_FAILED, t); + } + if (unsafe) { + throw t; + } + } + }; + } + + static boolean shouldLogFailure( + FailureLogState failureLogState, Throwable failure, long currentTime) { + return failureLogState.shouldLog(getFailureSignature(failure), currentTime); + } + + private static String getFailureSignature(Throwable failure) { + StringBuilder builder = new StringBuilder(failure.getClass().getName()); + if (failure.getMessage() != null) { + builder.append(':').append(failure.getMessage()); + } + Throwable cause = failure.getCause(); + if (cause != null) { + builder.append(";cause=").append(cause.getClass().getName()); + if (cause.getMessage() != null) { + builder.append(':').append(cause.getMessage()); + } + } + return builder.toString(); + } + + static class FailureLogState { + + private String lastFailureSignature; + private long nextLogTime; + + private boolean shouldLog(String failureSignature, long currentTime) { + if (!failureSignature.equals(lastFailureSignature) || currentTime >= nextLogTime) { + lastFailureSignature = failureSignature; + nextLogTime = currentTime + SCHEDULE_TASK_FAILED_LOG_INTERVAL_MS; + return true; + } + return false; + } + + void clear() { + lastFailureSignature = null; + nextLogTime = 0; + } } public static RuntimeException propagate(Throwable throwable) { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/FolderManager.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/FolderManager.java index 731dc66fa4a99..dd736452c6e1a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/FolderManager.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/FolderManager.java @@ -20,6 +20,7 @@ package org.apache.iotdb.commons.disk; import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.conf.CommonConfig; import org.apache.iotdb.commons.conf.CommonDescriptor; import org.apache.iotdb.commons.disk.strategy.DirectoryStrategy; import org.apache.iotdb.commons.disk.strategy.DirectoryStrategyType; @@ -30,6 +31,7 @@ import org.apache.iotdb.commons.exception.DiskSpaceInsufficientException; import org.apache.iotdb.commons.i18n.UtilMessages; import org.apache.iotdb.commons.utils.JVMCommonUtils; +import org.apache.iotdb.commons.utils.TestOnly; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,9 +44,15 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicLong; public class FolderManager { private static final Logger logger = LoggerFactory.getLogger(FolderManager.class); + private static final long ALL_FOLDERS_FULL_LOG_INTERVAL_MS = 3600 * 1000L; + private static final ConcurrentMap ALL_FOLDERS_FULL_LAST_LOG_TIME_MAP = + new ConcurrentHashMap<>(); /** Represents the operational states of a data folder. */ public enum FolderState { @@ -63,10 +71,12 @@ public enum FolderState { private final Map foldersStates = new HashMap<>(); private final DirectoryStrategy selectStrategy; + private final String foldersKey; public FolderManager(List folders, DirectoryStrategyType type) throws DiskSpaceInsufficientException { this.folders = folders; + this.foldersKey = folders.toString(); folders.forEach(dir -> foldersStates.put(dir, FolderState.HEALTHY)); switch (type) { case SEQUENCE_STRATEGY: @@ -87,10 +97,9 @@ public FolderManager(List folders, DirectoryStrategyType type) try { this.selectStrategy.setFolders(folders); this.selectStrategy.setFoldersStates(foldersStates); + resetAllFoldersFullLogTime(); } catch (DiskSpaceInsufficientException e) { - logger.error(UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY, e); - CommonDescriptor.getInstance().getConfig().setNodeStatus(NodeStatus.ReadOnly); - CommonDescriptor.getInstance().getConfig().setStatusReason(NodeStatus.DISK_FULL); + handleAllFoldersFull(e); throw e; } } @@ -98,15 +107,18 @@ public FolderManager(List folders, DirectoryStrategyType type) public void updateFolderState(String folder, FolderState state) { foldersStates.replace(folder, state); selectStrategy.updateFolderState(folder, state); + if (FolderState.HEALTHY.equals(state)) { + resetAllFoldersFullLogTime(); + } } public String getNextFolder() throws DiskSpaceInsufficientException { try { - return folders.get(selectStrategy.nextFolderIndex()); + String folder = folders.get(selectStrategy.nextFolderIndex()); + resetAllFoldersFullLogTime(); + return folder; } catch (DiskSpaceInsufficientException e) { - logger.error(UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY, e); - CommonDescriptor.getInstance().getConfig().setNodeStatus(NodeStatus.ReadOnly); - CommonDescriptor.getInstance().getConfig().setStatusReason(NodeStatus.DISK_FULL); + handleAllFoldersFull(e); throw e; } } @@ -135,19 +147,50 @@ public T getNextWithRetry(ThrowingFunction new AtomicLong(0L)); + long now = System.currentTimeMillis(); + long lastLogTime = lastAllFoldersFullLogTime.get(); + if ((lastLogTime == 0 || now - lastLogTime >= ALL_FOLDERS_FULL_LOG_INTERVAL_MS) + && lastAllFoldersFullLogTime.compareAndSet(lastLogTime, now)) { + logger.error(UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY, e); + } + } + + void resetAllFoldersFullLogTime() { + ALL_FOLDERS_FULL_LAST_LOG_TIME_MAP.remove(foldersKey); + } + + @TestOnly + static void resetAllFoldersFullLogTimes() { + ALL_FOLDERS_FULL_LAST_LOG_TIME_MAP.clear(); } public List getFolders() { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/DirectoryStrategy.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/DirectoryStrategy.java index e8f53ffe4c20f..c930972a99cc3 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/DirectoryStrategy.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/DirectoryStrategy.java @@ -18,11 +18,8 @@ */ package org.apache.iotdb.commons.disk.strategy; -import org.apache.iotdb.commons.cluster.NodeStatus; -import org.apache.iotdb.commons.conf.CommonDescriptor; import org.apache.iotdb.commons.disk.FolderManager; import org.apache.iotdb.commons.exception.DiskSpaceInsufficientException; -import org.apache.iotdb.commons.i18n.UtilMessages; import org.apache.iotdb.commons.utils.JVMCommonUtils; import org.slf4j.Logger; @@ -61,8 +58,6 @@ public void setFolders(List folders) throws DiskSpaceInsufficientExcepti } } if (!hasSpace) { - LOGGER.error(UtilMessages.DISK_SPACE_INSUFFICIENT_READ_ONLY); - CommonDescriptor.getInstance().getConfig().setNodeStatus(NodeStatus.ReadOnly); throw new DiskSpaceInsufficientException(folders); } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/MinFolderOccupiedSpaceFirstStrategy.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/MinFolderOccupiedSpaceFirstStrategy.java index d7345eb505ca8..20a4a3552d173 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/MinFolderOccupiedSpaceFirstStrategy.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/disk/strategy/MinFolderOccupiedSpaceFirstStrategy.java @@ -21,10 +21,17 @@ import org.apache.iotdb.commons.exception.DiskSpaceInsufficientException; import org.apache.iotdb.commons.i18n.UtilMessages; import org.apache.iotdb.commons.utils.JVMCommonUtils; +import org.apache.iotdb.commons.utils.TestOnly; import java.io.IOException; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicLong; public class MinFolderOccupiedSpaceFirstStrategy extends DirectoryStrategy { + private static final long CANNOT_CALCULATE_OCCUPIED_SPACE_LOG_INTERVAL_MS = 3600 * 1000L; + private static final ConcurrentMap + CANNOT_CALCULATE_OCCUPIED_SPACE_LAST_LOG_TIME_MAP = new ConcurrentHashMap<>(); @Override public int nextFolderIndex() throws DiskSpaceInsufficientException { @@ -47,8 +54,9 @@ private int getMinOccupiedSpaceFolder() throws DiskSpaceInsufficientException { long space = 0; try { space = JVMCommonUtils.getOccupiedSpace(folder); + resetCannotCalculateOccupiedSpaceLogTime(folder); } catch (IOException e) { - LOGGER.error(UtilMessages.CANNOT_CALCULATE_OCCUPIED_SPACE, folder, e); + logCannotCalculateOccupiedSpaceIfNecessary(folder, e); continue; } if (space < minSpace) { @@ -63,4 +71,26 @@ private int getMinOccupiedSpaceFolder() throws DiskSpaceInsufficientException { return minIndex; } + + private static void logCannotCalculateOccupiedSpaceIfNecessary(String folder, IOException e) { + AtomicLong lastLogTime = + CANNOT_CALCULATE_OCCUPIED_SPACE_LAST_LOG_TIME_MAP.computeIfAbsent( + folder, key -> new AtomicLong(0L)); + long now = System.currentTimeMillis(); + long previousLogTime = lastLogTime.get(); + if ((previousLogTime == 0 + || now - previousLogTime >= CANNOT_CALCULATE_OCCUPIED_SPACE_LOG_INTERVAL_MS) + && lastLogTime.compareAndSet(previousLogTime, now)) { + LOGGER.error(UtilMessages.CANNOT_CALCULATE_OCCUPIED_SPACE, folder, e); + } + } + + private static void resetCannotCalculateOccupiedSpaceLogTime(String folder) { + CANNOT_CALCULATE_OCCUPIED_SPACE_LAST_LOG_TIME_MAP.remove(folder); + } + + @TestOnly + public static void resetCannotCalculateOccupiedSpaceLogTimes() { + CANNOT_CALCULATE_OCCUPIED_SPACE_LAST_LOG_TIME_MAP.clear(); + } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutor.java index f6eb945b2735e..c28e555b3f1fd 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutor.java @@ -33,6 +33,7 @@ import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; /** * Single thread to execute pipe periodical jobs on DataNode or ConfigNode. This is for limiting the @@ -43,6 +44,8 @@ public abstract class AbstractPipePeriodicalJobExecutor { private static final Logger LOGGER = LoggerFactory.getLogger(AbstractPipePeriodicalJobExecutor.class); + private static final long PERIODICAL_JOB_FAILURE_LOG_INTERVAL_MS = TimeUnit.HOURS.toMillis(1); + private final ScheduledExecutorService executorService; private final long minIntervalSeconds; @@ -61,16 +64,7 @@ public AbstractPipePeriodicalJobExecutor( public void register(String id, Runnable periodicalJob, long intervalInSeconds) { periodicalJobs.add( new Pair<>( - new WrappedRunnable() { - @Override - public void runMayThrow() { - try { - periodicalJob.run(); - } catch (Exception e) { - LOGGER.warn(PipeMessages.PERIODICAL_JOB_FAILED, id, e); - } - } - }, + wrapPeriodicalJobWithFailureLogThrottle(id, periodicalJob), Math.max(intervalInSeconds / minIntervalSeconds, 1))); LOGGER.info( PipeMessages.PERIODICAL_JOB_REGISTERED, @@ -78,6 +72,33 @@ public void runMayThrow() { Math.max(intervalInSeconds / minIntervalSeconds, 1) * minIntervalSeconds); } + static WrappedRunnable wrapPeriodicalJobWithFailureLogThrottle( + String id, Runnable periodicalJob) { + return new WrappedRunnable() { + private final AtomicLong lastFailureLogTime = new AtomicLong(0L); + + @Override + public void runMayThrow() { + try { + periodicalJob.run(); + lastFailureLogTime.set(0L); + } catch (Exception e) { + logPeriodicalJobFailureIfNecessary(id, e, lastFailureLogTime); + } + } + }; + } + + private static void logPeriodicalJobFailureIfNecessary( + String id, Exception e, AtomicLong lastFailureLogTime) { + long now = System.currentTimeMillis(); + long previousLogTime = lastFailureLogTime.get(); + if ((previousLogTime == 0L || now - previousLogTime >= PERIODICAL_JOB_FAILURE_LOG_INTERVAL_MS) + && lastFailureLogTime.compareAndSet(previousLogTime, now)) { + LOGGER.warn(PipeMessages.PERIODICAL_JOB_FAILED, id, e); + } + } + public synchronized void start() { if (executorFuture == null) { rounds = 0; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/JVMCommonUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/JVMCommonUtils.java index ccbe525d3d591..64279e777685a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/JVMCommonUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/JVMCommonUtils.java @@ -31,11 +31,15 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Stream; public class JVMCommonUtils { private static final Logger LOGGER = LoggerFactory.getLogger(JVMCommonUtils.class); + private static final long DISK_WARNING_PRINT_INTERVAL_MS = 3600 * 1000L; /** Default executor pool maximum size. */ public static final int MAX_EXECUTOR_POOL_SIZE = Math.max(100, getCpuCores() * 5); @@ -44,6 +48,14 @@ public class JVMCommonUtils { private static double diskSpaceWarningThreshold = CommonDescriptor.getInstance().getConfig().getDiskSpaceWarningThreshold(); + private static final ConcurrentMap cannotGetFreeSpaceLastPrintTimeMap = + new ConcurrentHashMap<>(); + private static final ConcurrentMap diskAboveWarningThresholdLastPrintTimeMap = + new ConcurrentHashMap<>(); + private static final ConcurrentMap unexpectedGetUsableSpaceErrorLastPrintTimeMap = + new ConcurrentHashMap<>(); + private static final ConcurrentMap unexpectedGetDiskFreeRatioErrorLastPrintTimeMap = + new ConcurrentHashMap<>(); /** * get JDK version. @@ -69,10 +81,15 @@ public static long getUsableSpace(String dir) { try { File dirFile = FSFactoryProducer.getFSFactory().getFile(dir); dirFile.mkdirs(); - return IOUtils.retryNoException(5, 2000L, dirFile::getFreeSpace, space -> space > 0) - .orElse(0L); + long usableSpace = + IOUtils.retryNoException(5, 2000L, dirFile::getFreeSpace, space -> space > 0).orElse(0L); + unexpectedGetUsableSpaceErrorLastPrintTimeMap.remove(String.valueOf(dir)); + return usableSpace; } catch (Exception e) { - LOGGER.error(UtilMessages.UNEXPECTED_ERROR_CHECKING_DISK_SPACE_FOR_DIR, dir, e); + if (shouldPrintDiskWarning( + unexpectedGetUsableSpaceErrorLastPrintTimeMap, String.valueOf(dir))) { + LOGGER.error(UtilMessages.UNEXPECTED_ERROR_CHECKING_DISK_SPACE_FOR_DIR, dir, e); + } return 0L; } } @@ -87,20 +104,28 @@ public static double getDiskFreeRatio(String dir) { long freeSpace = IOUtils.retryNoException(5, 2000L, dirFile::getFreeSpace, space -> space > 0).orElse(0L); if (freeSpace == 0) { - LOGGER.warn(UtilMessages.CANNOT_GET_FREE_SPACE, dir); + if (shouldPrintDiskWarning(cannotGetFreeSpaceLastPrintTimeMap, dir)) { + LOGGER.warn(UtilMessages.CANNOT_GET_FREE_SPACE, dir); + } + } else { + cannotGetFreeSpaceLastPrintTimeMap.remove(dir); } long totalSpace = dirFile.getTotalSpace(); double ratio = 1.0 * freeSpace / totalSpace; if (ratio <= diskSpaceWarningThreshold) { - LOGGER.warn( - "{} is above the warning threshold, free space {}, total space {}", - dir, - freeSpace, - totalSpace); + if (shouldPrintDiskWarning(diskAboveWarningThresholdLastPrintTimeMap, dir)) { + LOGGER.warn(UtilMessages.DISK_ABOVE_WARNING_THRESHOLD, dir, freeSpace, totalSpace); + } + } else { + diskAboveWarningThresholdLastPrintTimeMap.remove(dir); } + unexpectedGetDiskFreeRatioErrorLastPrintTimeMap.remove(String.valueOf(dir)); return ratio; } catch (Exception e) { - LOGGER.error(UtilMessages.UNEXPECTED_ERROR_CHECKING_DISK_SPACE, dir, e); + if (shouldPrintDiskWarning( + unexpectedGetDiskFreeRatioErrorLastPrintTimeMap, String.valueOf(dir))) { + LOGGER.error(UtilMessages.UNEXPECTED_ERROR_CHECKING_DISK_SPACE, dir, e); + } return 0; } } @@ -128,4 +153,28 @@ public static int getMaxExecutorPoolSize() { public static void setDiskSpaceWarningThreshold(double threshold) { diskSpaceWarningThreshold = threshold; } + + @TestOnly + static void resetDiskWarningLastPrintTimes() { + cannotGetFreeSpaceLastPrintTimeMap.clear(); + diskAboveWarningThresholdLastPrintTimeMap.clear(); + unexpectedGetUsableSpaceErrorLastPrintTimeMap.clear(); + unexpectedGetDiskFreeRatioErrorLastPrintTimeMap.clear(); + } + + private static boolean shouldPrintDiskWarning( + ConcurrentMap lastPrintTimeMap, String dir) { + long now = System.currentTimeMillis(); + AtomicBoolean shouldPrint = new AtomicBoolean(false); + lastPrintTimeMap.compute( + dir, + (key, lastPrintTime) -> { + if (lastPrintTime == null || now - lastPrintTime > DISK_WARNING_PRINT_INTERVAL_MS) { + shouldPrint.set(true); + return now; + } + return lastPrintTime; + }); + return shouldPrint.get(); + } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/LogThrottler.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/LogThrottler.java new file mode 100644 index 0000000000000..cdc5dd8716759 --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/LogThrottler.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.utils; + +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.TimeUnit; + +public class LogThrottler { + + public static final long DEFAULT_LOG_INTERVAL_MS = TimeUnit.MINUTES.toMillis(5); + + private static final String DEFAULT_KEY = ""; + + private final long logIntervalMs; + private final ConcurrentMap failureLogStateMap = + new ConcurrentHashMap<>(); + + public LogThrottler() { + this(DEFAULT_LOG_INTERVAL_MS); + } + + public LogThrottler(long logIntervalMs) { + if (logIntervalMs < 0) { + throw new IllegalArgumentException("Log interval should not be negative"); + } + this.logIntervalMs = logIntervalMs; + } + + public boolean shouldLog(String failureSignature) { + return shouldLog(DEFAULT_KEY, failureSignature); + } + + public boolean shouldLog(String key, String failureSignature) { + return shouldLog(key, failureSignature, System.currentTimeMillis()); + } + + @TestOnly + boolean shouldLog(String failureSignature, long currentTime) { + return shouldLog(DEFAULT_KEY, failureSignature, currentTime); + } + + @TestOnly + boolean shouldLog(String key, String failureSignature, long currentTime) { + String safeKey = key == null ? DEFAULT_KEY : key; + String safeFailureSignature = String.valueOf(failureSignature); + return failureLogStateMap + .computeIfAbsent(safeKey, ignored -> new FailureLogState()) + .shouldLog(safeFailureSignature, currentTime, logIntervalMs); + } + + public void reset() { + failureLogStateMap.clear(); + } + + public void reset(String key) { + failureLogStateMap.remove(key == null ? DEFAULT_KEY : key); + } + + public static String getFailureSignature(Throwable failure) { + StringBuilder builder = new StringBuilder(failure.getClass().getName()); + if (failure.getMessage() != null) { + builder.append(':').append(failure.getMessage()); + } + Throwable cause = failure.getCause(); + if (cause != null) { + builder.append(";cause=").append(cause.getClass().getName()); + if (cause.getMessage() != null) { + builder.append(':').append(cause.getMessage()); + } + } + return builder.toString(); + } + + private static class FailureLogState { + + private String lastFailureSignature = ""; + private long nextLogTime = 0L; + + private synchronized boolean shouldLog( + String failureSignature, long currentTime, long logIntervalMs) { + if (!failureSignature.equals(lastFailureSignature) || currentTime >= nextLogTime) { + lastFailureSignature = failureSignature; + nextLogTime = currentTime + logIntervalMs; + return true; + } + return false; + } + } +} diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtilTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtilTest.java new file mode 100644 index 0000000000000..a94eaa5c4d5cb --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/concurrent/threadpool/ScheduledExecutorUtilTest.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.concurrent.threadpool; + +import org.apache.iotdb.commons.i18n.CommonMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +public class ScheduledExecutorUtilTest { + + @Test + public void sameFailureLoggedOnlyAfterIntervalOrRecovery() { + ScheduledExecutorUtil.FailureLogState failureLogState = + new ScheduledExecutorUtil.FailureLogState(); + RuntimeException failure = new RuntimeException("same failure"); + long startTime = 1000; + + Assert.assertTrue(ScheduledExecutorUtil.shouldLogFailure(failureLogState, failure, startTime)); + Assert.assertFalse( + ScheduledExecutorUtil.shouldLogFailure(failureLogState, failure, startTime + 1)); + + Assert.assertTrue( + ScheduledExecutorUtil.shouldLogFailure( + failureLogState, + failure, + startTime + ScheduledExecutorUtil.SCHEDULE_TASK_FAILED_LOG_INTERVAL_MS)); + Assert.assertFalse( + ScheduledExecutorUtil.shouldLogFailure( + failureLogState, + failure, + startTime + ScheduledExecutorUtil.SCHEDULE_TASK_FAILED_LOG_INTERVAL_MS + 1)); + + failureLogState.clear(); + Assert.assertTrue( + ScheduledExecutorUtil.shouldLogFailure(failureLogState, failure, startTime + 2)); + } + + @Test + public void differentFailuresLoggedImmediately() { + ScheduledExecutorUtil.FailureLogState failureLogState = + new ScheduledExecutorUtil.FailureLogState(); + long startTime = 1000; + + Assert.assertTrue( + ScheduledExecutorUtil.shouldLogFailure( + failureLogState, new RuntimeException("first"), startTime)); + Assert.assertFalse( + ScheduledExecutorUtil.shouldLogFailure( + failureLogState, new RuntimeException("first"), startTime + 1)); + Assert.assertTrue( + ScheduledExecutorUtil.shouldLogFailure( + failureLogState, new RuntimeException("second"), startTime + 2)); + Assert.assertTrue( + ScheduledExecutorUtil.shouldLogFailure( + failureLogState, new IllegalStateException("second"), startTime + 3)); + } + + @Test + public void safeScheduledCommandSuppressesRepeatedFailureUntilSuccess() throws Exception { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(ScheduledExecutorUtil.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(); + ScheduledFuture future = null; + try { + AtomicInteger runs = new AtomicInteger(); + CountDownLatch latch = new CountDownLatch(5); + future = + ScheduledExecutorUtil.safelyScheduleWithFixedDelay( + executor, + () -> { + int run = runs.incrementAndGet(); + latch.countDown(); + if (run <= 2 || run == 4) { + throw new RuntimeException("transient failure"); + } + }, + 0, + 10, + TimeUnit.MILLISECONDS); + + Assert.assertTrue(latch.await(10, TimeUnit.SECONDS)); + future.cancel(true); + + Assert.assertEquals(2, countLogEvents(appender, CommonMessages.SCHEDULE_TASK_FAILED)); + } finally { + if (future != null) { + future.cancel(true); + } + executor.shutdownNow(); + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/disk/FolderManagerLogTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/disk/FolderManagerLogTest.java new file mode 100644 index 0000000000000..407de2658bfb5 --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/disk/FolderManagerLogTest.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.disk; + +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.conf.CommonConfig; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.commons.disk.FolderManager.FolderState; +import org.apache.iotdb.commons.disk.strategy.DirectoryStrategyType; +import org.apache.iotdb.commons.exception.DiskSpaceInsufficientException; +import org.apache.iotdb.commons.i18n.UtilMessages; +import org.apache.iotdb.commons.utils.JVMCommonUtils; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class FolderManagerLogTest { + + private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig(); + + @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); + + private FolderManager folderManager; + private List testFolders; + private NodeStatus previousNodeStatus; + private String previousStatusReason; + private double previousDiskSpaceWarningThreshold; + + @Before + public void setUp() throws Exception { + previousNodeStatus = COMMON_CONFIG.getNodeStatus(); + previousStatusReason = COMMON_CONFIG.getStatusReason(); + previousDiskSpaceWarningThreshold = COMMON_CONFIG.getDiskSpaceWarningThreshold(); + COMMON_CONFIG.setNodeStatus(NodeStatus.Running); + COMMON_CONFIG.setStatusReason(null); + setDiskSpaceWarningThresholdForTest(previousDiskSpaceWarningThreshold); + FolderManager.resetAllFoldersFullLogTimes(); + + File folder1 = tempFolder.newFolder("folder1"); + File folder2 = tempFolder.newFolder("folder2"); + File folder3 = tempFolder.newFolder("folder3"); + testFolders = + Arrays.asList( + folder1.getAbsolutePath(), folder2.getAbsolutePath(), folder3.getAbsolutePath()); + folderManager = new FolderManager(testFolders, DirectoryStrategyType.SEQUENCE_STRATEGY); + } + + @After + public void tearDown() { + setDiskSpaceWarningThresholdForTest(previousDiskSpaceWarningThreshold); + FolderManager.resetAllFoldersFullLogTimes(); + COMMON_CONFIG.setNodeStatus(previousNodeStatus); + COMMON_CONFIG.setStatusReason(previousStatusReason); + } + + @Test + public void allFoldersFullErrorLoggedOnlyOnceUntilFolderRecovers() throws Exception { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(FolderManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + markAllFolders(FolderState.ABNORMAL); + assertDiskFullOnNextFolder(); + assertDiskFullOnNextFolder(); + + Assert.assertEquals( + 1, countLogEvents(appender, UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY)); + Assert.assertEquals(NodeStatus.ReadOnly, COMMON_CONFIG.getNodeStatus()); + Assert.assertEquals(NodeStatus.DISK_FULL, COMMON_CONFIG.getStatusReason()); + + folderManager.updateFolderState(testFolders.get(0), FolderState.HEALTHY); + Assert.assertNotNull(folderManager.getNextFolder()); + + markAllFolders(FolderState.ABNORMAL); + assertDiskFullOnNextFolder(); + Assert.assertEquals( + 2, countLogEvents(appender, UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + @Test + public void allFoldersFullErrorLoggedOnlyOnceAcrossFailedConstructions() throws Exception { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(FolderManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + setDiskSpaceWarningThresholdForTest(1.1); + assertDiskFullOnNewFolderManager(); + assertDiskFullOnNewFolderManager(); + + Assert.assertEquals( + 1, countLogEvents(appender, UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY)); + + setDiskSpaceWarningThresholdForTest(-1.0); + Assert.assertNotNull(new FolderManager(testFolders, DirectoryStrategyType.SEQUENCE_STRATEGY)); + + setDiskSpaceWarningThresholdForTest(1.1); + assertDiskFullOnNewFolderManager(); + Assert.assertEquals( + 2, countLogEvents(appender, UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + @Test + public void getNextWithRetryLogsAllFoldersFullOnlyOnceUntilFolderRecovers() throws Exception { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(FolderManager.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + try { + assertDiskFullAfterFolderProcessingRetriesFail(); + assertDiskFullAfterFolderProcessingRetriesFail(); + + Assert.assertEquals( + 1, countLogEvents(appender, UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY)); + Assert.assertEquals(NodeStatus.ReadOnly, COMMON_CONFIG.getNodeStatus()); + Assert.assertEquals(NodeStatus.DISK_FULL, COMMON_CONFIG.getStatusReason()); + + folderManager.updateFolderState(testFolders.get(0), FolderState.HEALTHY); + Assert.assertEquals( + testFolders.get(0), folderManager.getNextWithRetry(folder -> testFolders.get(0))); + + markAllFolders(FolderState.ABNORMAL); + assertDiskFullAfterFolderProcessingRetriesFail(); + Assert.assertEquals( + 2, countLogEvents(appender, UtilMessages.ALL_FOLDERS_FULL_CHANGE_TO_READ_ONLY)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private void markAllFolders(FolderState state) { + testFolders.forEach(folder -> folderManager.updateFolderState(folder, state)); + } + + private void assertDiskFullOnNextFolder() { + try { + folderManager.getNextFolder(); + Assert.fail("Expected DiskSpaceInsufficientException"); + } catch (DiskSpaceInsufficientException e) { + Assert.assertTrue(e.getMessage().contains("Can't get next folder")); + } + } + + private void assertDiskFullOnNewFolderManager() { + try { + new FolderManager(testFolders, DirectoryStrategyType.SEQUENCE_STRATEGY); + Assert.fail("Expected DiskSpaceInsufficientException"); + } catch (DiskSpaceInsufficientException e) { + Assert.assertTrue(e.getMessage().contains("Can't get next folder")); + } + } + + private void assertDiskFullAfterFolderProcessingRetriesFail() { + try { + folderManager.getNextWithRetry( + folder -> { + throw new RuntimeException("Failed to process folder"); + }); + Assert.fail("Expected DiskSpaceInsufficientException"); + } catch (DiskSpaceInsufficientException e) { + Assert.assertTrue(e.getMessage().contains("Can't get next folder")); + } + } + + private void setDiskSpaceWarningThresholdForTest(double threshold) { + COMMON_CONFIG.setDiskSpaceWarningThreshold(threshold); + JVMCommonUtils.setDiskSpaceWarningThreshold(threshold); + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutorLogTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutorLogTest.java new file mode 100644 index 0000000000000..ba52513a44ef2 --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/pipe/agent/runtime/AbstractPipePeriodicalJobExecutorLogTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.pipe.agent.runtime; + +import org.apache.iotdb.commons.concurrent.WrappedRunnable; +import org.apache.iotdb.commons.i18n.PipeMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.atomic.AtomicBoolean; + +public class AbstractPipePeriodicalJobExecutorLogTest { + + @Test + public void periodicalJobFailureLoggedOnlyOnceUntilSuccess() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) + LoggerFactory.getLogger(AbstractPipePeriodicalJobExecutor.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.WARN); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + AtomicBoolean shouldFail = new AtomicBoolean(true); + WrappedRunnable job = + AbstractPipePeriodicalJobExecutor.wrapPeriodicalJobWithFailureLogThrottle( + "testJob", + () -> { + if (shouldFail.get()) { + throw new RuntimeException("periodical job failure"); + } + }); + + try { + job.run(); + job.run(); + Assert.assertEquals(1, countLogEvents(appender, PipeMessages.PERIODICAL_JOB_FAILED)); + + shouldFail.set(false); + job.run(); + + shouldFail.set(true); + job.run(); + Assert.assertEquals(2, countLogEvents(appender, PipeMessages.PERIODICAL_JOB_FAILED)); + } finally { + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } +} diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/JVMCommonUtilsTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/JVMCommonUtilsTest.java index 4a247cfb76549..2b611781f4bad 100644 --- a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/JVMCommonUtilsTest.java +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/JVMCommonUtilsTest.java @@ -19,8 +19,19 @@ package org.apache.iotdb.commons.utils; +import org.apache.iotdb.commons.conf.CommonDescriptor; +import org.apache.iotdb.commons.i18n.UtilMessages; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; import org.junit.Assert; import org.junit.Test; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; public class JVMCommonUtilsTest { @@ -39,4 +50,67 @@ public void getJdkVersionTest() { Assert.fail(); } } + + @Test + public void unexpectedDiskSpaceErrorsLoggedOnlyOnceWhileErrorPersists() { + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(JVMCommonUtils.class); + Level previousLevel = logger.getLevel(); + logger.setLevel(Level.ERROR); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + JVMCommonUtils.resetDiskWarningLastPrintTimes(); + try { + JVMCommonUtils.getUsableSpace(null); + JVMCommonUtils.getUsableSpace(null); + Assert.assertEquals( + 1, countLogEvents(appender, UtilMessages.UNEXPECTED_ERROR_CHECKING_DISK_SPACE_FOR_DIR)); + + JVMCommonUtils.getDiskFreeRatio(null); + JVMCommonUtils.getDiskFreeRatio(null); + Assert.assertEquals( + 1, countLogEvents(appender, UtilMessages.UNEXPECTED_ERROR_CHECKING_DISK_SPACE)); + } finally { + JVMCommonUtils.resetDiskWarningLastPrintTimes(); + logger.detachAppender(appender); + logger.setLevel(previousLevel); + appender.stop(); + } + } + + @Test + public void getDiskFreeRatioWarnsOnlyOnceWhileDiskWarningPersists() throws IOException { + Path dir = Files.createTempDirectory("jvm-common-utils-test"); + ch.qos.logback.classic.Logger logger = + (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(JVMCommonUtils.class); + ListAppender appender = new ListAppender<>(); + appender.setContext(logger.getLoggerContext()); + appender.start(); + logger.addAppender(appender); + + JVMCommonUtils.resetDiskWarningLastPrintTimes(); + JVMCommonUtils.setDiskSpaceWarningThreshold(1.1); + try { + JVMCommonUtils.getDiskFreeRatio(dir.toString()); + JVMCommonUtils.getDiskFreeRatio(dir.toString()); + + Assert.assertEquals(1, countLogEvents(appender, UtilMessages.DISK_ABOVE_WARNING_THRESHOLD)); + } finally { + JVMCommonUtils.setDiskSpaceWarningThreshold( + CommonDescriptor.getInstance().getConfig().getDiskSpaceWarningThreshold()); + JVMCommonUtils.resetDiskWarningLastPrintTimes(); + logger.detachAppender(appender); + appender.stop(); + Files.deleteIfExists(dir); + } + } + + private long countLogEvents(ListAppender appender, String messagePattern) { + return appender.list.stream() + .filter(event -> messagePattern.equals(event.getMessage())) + .count(); + } } diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/LogThrottlerTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/LogThrottlerTest.java new file mode 100644 index 0000000000000..46d8580859a4a --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/utils/LogThrottlerTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.utils; + +import org.junit.Assert; +import org.junit.Test; + +public class LogThrottlerTest { + + @Test + public void sameFailureLoggedOnlyAfterIntervalOrReset() { + LogThrottler logThrottler = new LogThrottler(); + long startTime = 1000L; + + Assert.assertTrue(logThrottler.shouldLog("failure", startTime)); + Assert.assertFalse(logThrottler.shouldLog("failure", startTime + 1)); + + Assert.assertTrue( + logThrottler.shouldLog("failure", startTime + LogThrottler.DEFAULT_LOG_INTERVAL_MS)); + Assert.assertFalse( + logThrottler.shouldLog("failure", startTime + LogThrottler.DEFAULT_LOG_INTERVAL_MS + 1)); + + logThrottler.reset(); + Assert.assertTrue(logThrottler.shouldLog("failure", startTime + 2)); + } + + @Test + public void differentFailuresAndKeysAreLoggedIndependently() { + LogThrottler logThrottler = new LogThrottler(); + long startTime = 1000L; + + Assert.assertTrue(logThrottler.shouldLog("key1", "failure1", startTime)); + Assert.assertFalse(logThrottler.shouldLog("key1", "failure1", startTime + 1)); + Assert.assertTrue(logThrottler.shouldLog("key1", "failure2", startTime + 2)); + + Assert.assertTrue(logThrottler.shouldLog("key2", "failure1", startTime + 3)); + Assert.assertFalse(logThrottler.shouldLog("key2", "failure1", startTime + 4)); + + logThrottler.reset("key2"); + Assert.assertFalse(logThrottler.shouldLog("key1", "failure2", startTime + 5)); + Assert.assertTrue(logThrottler.shouldLog("key2", "failure1", startTime + 6)); + } + + @Test + public void exceptionSignatureIncludesCause() { + RuntimeException failure = new RuntimeException("outer", new IllegalStateException("inner")); + + Assert.assertEquals( + "java.lang.RuntimeException:outer;cause=java.lang.IllegalStateException:inner", + LogThrottler.getFailureSignature(failure)); + } + + @Test(expected = IllegalArgumentException.class) + public void negativeIntervalRejected() { + new LogThrottler(-1); + } +}