InfiniTensor · chen2021673 · May 21, 2026 · kilinchange · May 23, 2026 · kilinchange
diff --git a/.clang-format b/.clang-format
@@ -1,13 +1,50 @@
 ---
-BasedOnStyle: LLVM
+BasedOnStyle: Google
 IndentWidth: 4
 AccessModifierOffset: -4
+PointerAlignment: Right
+DerivePointerAlignment: false
+AlignEscapedNewlines: Right
 AlignOperands: AlignAfterOperator
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
 BreakBeforeBinaryOperators: All
 ColumnLimit: 120
-AllowShortBlocksOnASingleLine: Always 
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyReturnTypeOnItsOwnLine: 60
+AllowShortBlocksOnASingleLine: Always
+AllowShortIfStatementsOnASingleLine: Never
 AllowShortLoopsOnASingleLine: true
+IndentCaseLabels: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+PackConstructorInitializers: BinPack
+SpacesBeforeTrailingComments: 1
+Standard: Latest
 InsertBraces: true
+SortIncludes: CaseSensitive
+IncludeBlocks: Regroup
+IncludeCategories:
+  # C standard library headers (ISO C through C11); other <*.h> headers fall through to Priority 3.
+  - Regex: '^<(assert|complex|ctype|errno|fenv|float|inttypes|iso646|limits|locale|math|setjmp|signal|stdalign|stdarg|stdatomic|stdbool|stddef|stdint|stdio|stdlib|stdnoreturn|string|tgmath|threads|time|uchar|wchar|wctype)\.h>$'
+    Priority: 1
+  # C++ standard library headers (through C++23, matching `Standard: Latest`).
+  - Regex: '^<(algorithm|any|array|atomic|barrier|bit|bitset|cassert|ccomplex|cctype|cerrno|cfenv|cfloat|charconv|chrono|cinttypes|ciso646|climits|clocale|cmath|codecvt|compare|complex|concepts|condition_variable|coroutine|csetjmp|csignal|cstdalign|cstdarg|cstdbool|cstddef|cstdint|cstdio|cstdlib|cstring|ctgmath|ctime|cuchar|cwchar|cwctype|deque|exception|execution|expected|filesystem|flat_map|flat_set|format|forward_list|fstream|functional|future|generator|initializer_list|iomanip|ios|iosfwd|iostream|istream|iterator|latch|limits|list|locale|map|mdspan|memory|memory_resource|mutex|new|numbers|numeric|optional|ostream|print|queue|random|ranges|ratio|regex|scoped_allocator|semaphore|set|shared_mutex|source_location|span|spanstream|sstream|stack|stacktrace|stdexcept|stdfloat|stop_token|streambuf|string|string_view|strstream|syncstream|system_error|thread|tuple|type_traits|typeindex|typeinfo|unordered_map|unordered_set|utility|valarray|variant|vector|version)>$'
+    Priority: 2
+  # Other external library headers, for example CUDA/MACA/NCCL/MPI.
+  - Regex: '^<.*>$'
+    Priority: 3
+  # Vendored third-party headers included with quotes.
+  - Regex: '^"(third_party/|Eigen/|gflags/|glog/)'
+    Priority: 4
+  # Public project interfaces.
+  - Regex: '^"infini_train/include/'
+    Priority: 5
+  # Internal project implementation headers.
+  - Regex: '^"infini_train/src/'
+    Priority: 6
+  # Examples and other local quoted headers.
+  - Regex: '^".*"$'
+    Priority: 7
 BreakBeforeBraces: Custom
 BraceWrapping:
   AfterCaseLabel: false
@@ -28,4 +65,3 @@ BraceWrapping:
   SplitEmptyFunction: true
   SplitEmptyRecord: true
   SplitEmptyNamespace: true
-
diff --git a/.github/workflows/format-check.yaml b/.github/workflows/format-check.yaml
@@ -16,12 +16,43 @@ jobs:
     - name: Checkout code
       uses: actions/checkout@v4
 
+    - name: Install system dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y clang-format-16 include-what-you-use
+
     - name: Install Python dependencies
       run: |
         python3 -m pip install --upgrade pip
-        pip install black
+        pip install black colorama
 
     - name: Run format check
       run: |
         python3 scripts/format.py --path infini_train example --check
 
+    - name: Run custom style check
+      run: |
+        python3 scripts/style_check.py --path infini_train example
+
+    - name: Configure compile database for IWYU
+      # Keep IWYU advisory until the existing codebase is fully cleaned up.
+      continue-on-error: true
+      run: |
+        cmake -S . -B build-iwyu -DUSE_CUDA=OFF -DUSE_MACA=OFF -DUSE_MPI=OFF -DUSE_OMP=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+
+    - name: Run IWYU check
+      continue-on-error: true
+      run: |
+        if command -v iwyu_tool.py >/dev/null; then
+          IWYU_TOOL="$(command -v iwyu_tool.py)"
+        else
+          IWYU_TOOL="$(command -v iwyu_tool)"
+        fi
+        mapfile -t IWYU_SOURCES < <(
+          find infini_train example -type f \( -name '*.c' -o -name '*.cc' -o -name '*.cpp' -o -name '*.cxx' \) \
+            ! -path 'infini_train/src/core/ccl/cuda/*' \
+            ! -path 'infini_train/src/core/runtime/cuda/*' \
+            ! -path 'infini_train/src/core/ccl/maca/*' \
+            ! -path 'infini_train/src/core/runtime/maca/*'
+        )
+        "${IWYU_TOOL}" -p build-iwyu -j "$(nproc)" "${IWYU_SOURCES[@]}"
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -217,6 +217,11 @@ if(BUILD_TEST)
   add_subdirectory(tests)
 endif()
 
+if(USE_MACA)
+  add_executable(test_maca_allocator test/runtime/test_maca_allocator.cc)
+  link_infini_train_exe(test_maca_allocator)
+endif()
+
 # Negative compile test: missing dtype registration must fail at compile time.
 set(DTYPE_DISPATCH_COMPILE_FAIL_SOURCE
   ${PROJECT_SOURCE_DIR}/tests/dtype/test_dtype_dispatch_compile_fail.cc)

diff --git a/example/common/tokenizer.cc b/example/common/tokenizer.cc
@@ -9,11 +9,12 @@
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
 #include "infini_train/include/nn/functional.h"
 #include "infini_train/include/nn/modules/module.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+
 namespace infini_train {
 
 constexpr uint32_t kGpt2Eot = 50256;

diff --git a/example/gpt2/checkpoint_loader.cc b/example/gpt2/checkpoint_loader.cc
@@ -12,8 +12,6 @@
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
-#include "example/gpt2/config.h"
 #include "infini_train/include/nn/modules/normalization.h"
 #include "infini_train/include/nn/modules/sparse.h"
 #include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
@@ -24,6 +22,9 @@
 #include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+#include "example/gpt2/config.h"
+
 using namespace infini_train;
 namespace nn = infini_train::nn;
 
@@ -101,7 +102,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     // ========== pp_size：num_stages; vpp_size: num_chunks_per_stage ==========
     int pp_size = nn::parallel::global::GetPipelineParallelSize();
     int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
-    auto pp_rank = nn::parallel::pp_rank;
+    auto pp_rank = nn::parallel::tls_pp_rank;
     auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
         = nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
     // ========== layer to chunk ==========
@@ -110,7 +111,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
         for (int i = start; i < end; ++i) { owned_layers[i] = true; }
     }
 
-    auto tp_rank = nn::parallel::tp_rank;
+    auto tp_rank = nn::parallel::tls_tp_rank;
     // calculate xx_size_per_partition
     const int64_t vpp = model_vocab_size / tp_size;
     const int64_t v_start = static_cast<int64_t>(tp_rank) * vpp;

diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -137,7 +137,7 @@ void Train(const nn::parallel::Rank &rank) {
 
     // Set thread-local global rank
     // TODO(dcj): Use DeviceGuardImpl to get GlobalRank later.
-    nn::parallel::global::thread_global_rank = rank.GlobalRank();
+    nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();
 
     const ProcessGroup *ddp_pg = nullptr;
     const ProcessGroup *tp_pg = nullptr;
@@ -158,15 +158,14 @@ void Train(const nn::parallel::Rank &rank) {
                                             GetTensorParallelGroupRanks(rank.GlobalRank()));
             tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
             // NOTE(zbl): Reserved for VocabParallelEmbedding
-            nn::parallel::tp_rank = tp_rank;
+            nn::parallel::tls_tp_rank = tp_rank;
         }
 
         if (pp_world_size > 1) {
             pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
                                             GetPipelineParallelGroupRanks(rank.GlobalRank()));
             pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
-            nn::parallel::pp_rank = pp_rank;
+            nn::parallel::tls_pp_rank = pp_rank;
         }
     } else {
         device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);

diff --git a/example/llama3/checkpoint_loader.cc b/example/llama3/checkpoint_loader.cc
@@ -12,8 +12,6 @@
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
-#include "example/llama3/config.h"
 #include "infini_train/include/nn/modules/normalization.h"
 #include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
 #include "infini_train/include/nn/modules/transformer/mlp.h"
@@ -22,6 +20,9 @@
 #include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+#include "example/llama3/config.h"
+
 using namespace infini_train;
 namespace nn = infini_train::nn;
 
@@ -86,7 +87,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     // ========== pp_size：num_stages; vpp_size: num_chunks_per_stage ==========
     int pp_size = nn::parallel::global::GetPipelineParallelSize();
     int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
-    auto pp_rank = nn::parallel::pp_rank;
+    auto pp_rank = nn::parallel::tls_pp_rank;
     auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
         = nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
     // ========== layer to chunk ==========
@@ -96,7 +97,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     }
 
     const int tp_size = nn::parallel::global::GetTensorParallelSize();
-    const int tp_rank = nn::parallel::tp_rank;
+    const int tp_rank = nn::parallel::tls_tp_rank;
 
     CHECK_EQ(n_embd % tp_size, 0) << "n_embd must be divisible by TP world size.";
     CHECK_EQ(n_head % tp_size, 0) << "n_head must be divisible by TP world size.";

diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -122,7 +122,7 @@ void Train(const nn::parallel::Rank &rank) {
     int pp_rank = 0;
 
     // Set thread-local global rank
-    nn::parallel::global::thread_global_rank = rank.GlobalRank();
+    nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();
 
     const ProcessGroup *ddp_pg = nullptr;
     const ProcessGroup *tp_pg = nullptr;
@@ -143,15 +143,14 @@ void Train(const nn::parallel::Rank &rank) {
                                             GetTensorParallelGroupRanks(rank.GlobalRank()));
             tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
             // NOTE(zbl): Reserved for VocabParallelEmbedding
-            nn::parallel::tp_rank = tp_rank;
+            nn::parallel::tls_tp_rank = tp_rank;
         }
 
         if (pp_world_size > 1) {
             pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
                                             GetPipelineParallelGroupRanks(rank.GlobalRank()));
             pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
-            nn::parallel::pp_rank = pp_rank;
+            nn::parallel::tls_pp_rank = pp_rank;
         }
     } else {
         device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);

diff --git a/infini_train/include/autograd/grad_mode.h b/infini_train/include/autograd/grad_mode.h
@@ -5,13 +5,12 @@ namespace infini_train::autograd {
 
 class GradMode {
 public:
-    // Whether to enable Autograd (enabled by default)
-    static bool IsEnabled() { return grad_enabled_; }
-    static void SetEnabled(bool enabled) { grad_enabled_ = enabled; }
+    // Whether Autograd is enabled for the calling thread (enabled by default; the flag is thread_local).
+    static bool IsEnabled() { return tls_grad_enabled_; }
+    static void SetEnabled(bool enabled) { tls_grad_enabled_ = enabled; }
 
 private:
-    // grad mode should be thread_local
-    static thread_local bool grad_enabled_;
+    static thread_local bool tls_grad_enabled_;
 };
 
 // RAII: Disable grad (align with torch.no_grad)

diff --git a/infini_train/include/nn/parallel/global.h b/infini_train/include/nn/parallel/global.h
@@ -6,7 +6,7 @@
 
 namespace infini_train::nn::parallel::global {
 
-extern thread_local int thread_global_rank;
+extern thread_local int tls_thread_global_rank;
 
 enum Axis : uint8_t { DP = 0, TP = 1, PP = 2, AXIS_COUNT = 3 };
 

diff --git a/infini_train/include/nn/parallel/pp/pipeline_parallel.h b/infini_train/include/nn/parallel/pp/pipeline_parallel.h
@@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {
 class PipelineStage;
 class PipelineSchedule;
 
-extern thread_local int pp_rank;
+extern thread_local int tls_pp_rank;
 
 struct StageInfo {
     bool is_first_stage;

diff --git a/infini_train/include/nn/parallel/tensor_parallel.h b/infini_train/include/nn/parallel/tensor_parallel.h
@@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {
 
 // NOTE(zbl): Reserved for VocabParallelEmbedding, since rank is needed in its constructor before any Device exists
 //            On other occasions, should use Device::Rank()
-extern thread_local int tp_rank;
+extern thread_local int tls_tp_rank;
 
 class ColumnParallelLinear : public nn::CloneableModule<ColumnParallelLinear> {
 public:

diff --git a/infini_train/include/profiler.h b/infini_train/include/profiler.h
@@ -17,23 +17,23 @@ namespace core {
 class Event;
 }
 
-inline thread_local int g_profiling_depth = 0;
+inline thread_local int tls_profiling_depth = 0;
 
 struct ProfileContext {
     std::string name;
     Device::DeviceType device;
 };
 
-inline thread_local ProfileContext g_profile_context;
+inline thread_local ProfileContext tls_profile_context;
 
 inline void SetProfileContext(const std::string &name, Device::DeviceType device) {
-    if (g_profiling_depth == 0) {
-        g_profile_context.name = name;
-        g_profile_context.device = device;
+    if (tls_profiling_depth == 0) {
+        tls_profile_context.name = name;
+        tls_profile_context.device = device;
     }
 }
 
-inline const ProfileContext &GetProfileContext() { return g_profile_context; }
+inline const ProfileContext &GetProfileContext() { return tls_profile_context; }
 
 struct KernelProfileInfo {
     int64_t host_total_us = 0;
@@ -89,13 +89,14 @@ class Profiler {
     std::string current_tag_ = "Untagged";
 
     // thread-local tracking
-    thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point> cpu_timing_map_;
+    thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point>
+        tls_cpu_timing_map_;
 
     struct EventPair {
         core::Event *start = nullptr;
         core::Event *stop = nullptr;
     };
 
-    thread_local static inline std::map<std::string, EventPair> device_timing_map_;
+    thread_local static inline std::map<std::string, EventPair> tls_device_timing_map_;
 };
 } // namespace infini_train
diff --git a/infini_train/include/utils/global_module_hook_registry.h b/infini_train/include/utils/global_module_hook_registry.h
@@ -1,12 +1,13 @@
 #pragma once
 
-#include "infini_train/include/common/hook.h"
-#include "infini_train/include/tensor.h"
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <vector>
 
+#include "infini_train/include/common/hook.h"
+#include "infini_train/include/tensor.h"
+
 namespace infini_train {
 namespace nn {
 class Module;

diff --git a/infini_train/src/autograd/grad_mode.cc b/infini_train/src/autograd/grad_mode.cc
@@ -1,5 +1,5 @@
 #include "infini_train/include/autograd/grad_mode.h"
 
 namespace infini_train::autograd {
-thread_local bool GradMode::grad_enabled_ = true;
+thread_local bool GradMode::tls_grad_enabled_ = true;
 } // namespace infini_train::autograd
diff --git a/infini_train/src/core/ccl/cuda/nccl_impl.cc b/infini_train/src/core/ccl/cuda/nccl_impl.cc
@@ -1,8 +1,9 @@
 #include "infini_train/src/core/ccl/cuda/nccl_impl.h"
 
-#include <nccl.h>
 #include <vector>
 
+#include <nccl.h>
+
 #include "glog/logging.h"
 
 #include "infini_train/include/common/cuda/common_cuda.h"

diff --git a/infini_train/src/kernels/cpu/embedding.cc b/infini_train/src/kernels/cpu/embedding.cc
@@ -46,7 +46,7 @@ std::shared_ptr<Tensor> EmbeddingBackward(const std::shared_ptr<Tensor> &input,
     for (int i = 0; i < input->NumElements(); ++i) {
         int idx = static_cast<int>(static_cast<const int64_t *>(input->DataPtr())[i]);
         for (int j = 0; j < embedding_dim; ++j) {
-            static_cast<float *>(grad_weight->DataPtr())[idx * embedding_dim + j] // <-- 修复这里
+            static_cast<float *>(grad_weight->DataPtr())[idx * embedding_dim + j]
                 += static_cast<const float *>(grad_output->DataPtr())[i * embedding_dim + j];
         }
     }

diff --git a/infini_train/src/kernels/cpu/linear.cc b/infini_train/src/kernels/cpu/linear.cc
@@ -1,3 +1,5 @@
+#include "infini_train/include/autograd/linear.h"
+
 #include <cstdint>
 #include <memory>
 #include <numeric>