Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions .clang-format
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个文件不要动。

Original file line number Diff line number Diff line change
@@ -1,13 +1,50 @@
---
BasedOnStyle: LLVM
BasedOnStyle: Google
IndentWidth: 4
AccessModifierOffset: -4
PointerAlignment: Right
DerivePointerAlignment: false
AlignEscapedNewlines: Right
AlignOperands: AlignAfterOperator
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
BreakBeforeBinaryOperators: All
ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyReturnTypeOnItsOwnLine: 60
AllowShortBlocksOnASingleLine: Always
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: true
IndentCaseLabels: false
KeepEmptyLinesAtTheStartOfBlocks: true
PackConstructorInitializers: BinPack
SpacesBeforeTrailingComments: 1
Standard: Latest
InsertBraces: true
SortIncludes: CaseSensitive
IncludeBlocks: Regroup
IncludeCategories:
# C system headers.
- Regex: '^<(assert|complex|ctype|errno|fenv|float|inttypes|iso646|limits|locale|math|setjmp|signal|stdalign|stdarg|stdbool|stddef|stdint|stdio|stdlib|string|tgmath|time|uchar|wchar|wctype)\.h>$'
Priority: 1
# C++ standard library headers.
- Regex: '^<(algorithm|any|array|atomic|barrier|bit|bitset|cassert|ccomplex|cctype|cerrno|cfenv|cfloat|charconv|chrono|cinttypes|ciso646|climits|clocale|cmath|codecvt|compare|complex|concepts|condition_variable|coroutine|csetjmp|csignal|cstdalign|cstdarg|cstdbool|cstddef|cstdint|cstdio|cstdlib|cstring|ctgmath|ctime|cuchar|cwchar|cwctype|deque|exception|execution|expected|filesystem|format|forward_list|fstream|functional|future|initializer_list|iomanip|ios|iosfwd|iostream|istream|iterator|latch|limits|list|locale|map|memory|memory_resource|mutex|new|numbers|numeric|optional|ostream|queue|random|ranges|ratio|regex|scoped_allocator|semaphore|set|shared_mutex|source_location|span|sstream|stack|stdexcept|stop_token|streambuf|string|string_view|strstream|syncstream|system_error|thread|tuple|type_traits|typeindex|typeinfo|unordered_map|unordered_set|utility|valarray|variant|vector|version)>$'
Priority: 2
# Other external library headers, for example CUDA/MACA/NCCL/MPI.
- Regex: '^<.*>$'
Priority: 3
# vendored third-party headers included with quotes.
- Regex: '^"(third_party/|Eigen/|gflags/|glog/)'
Priority: 4
# Public project interfaces.
- Regex: '^"infini_train/include/'
Priority: 5
# Internal project implementation headers.
- Regex: '^"infini_train/src/'
Priority: 6
# Examples and other local quoted headers.
- Regex: '^".*"$'
Priority: 7
BreakBeforeBraces: Custom
BraceWrapping:
AfterCaseLabel: false
Expand All @@ -28,4 +65,3 @@ BraceWrapping:
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true

33 changes: 32 additions & 1 deletion .github/workflows/format-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,43 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y clang-format-16 include-what-you-use

- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
pip install black
pip install black colorama

- name: Run format check
run: |
python3 scripts/format.py --path infini_train example --check

- name: Run custom style check
run: |
python3 scripts/style_check.py --path infini_train example

- name: Configure compile database for IWYU
# Keep IWYU advisory until the existing codebase is fully cleaned up.
continue-on-error: true
run: |
cmake -S . -B build-iwyu -DUSE_CUDA=OFF -DUSE_MACA=OFF -DUSE_MPI=OFF -DUSE_OMP=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

- name: Run IWYU check
continue-on-error: true
run: |
if command -v iwyu_tool.py >/dev/null; then
IWYU_TOOL="$(command -v iwyu_tool.py)"
else
IWYU_TOOL="$(command -v iwyu_tool)"
fi
mapfile -t IWYU_SOURCES < <(
find infini_train example -type f \( -name '*.c' -o -name '*.cc' -o -name '*.cpp' -o -name '*.cxx' \) \
! -path 'infini_train/src/core/ccl/cuda/*' \
! -path 'infini_train/src/core/runtime/cuda/*' \
! -path 'infini_train/src/core/ccl/maca/*' \
! -path 'infini_train/src/core/runtime/maca/*'
)
"${IWYU_TOOL}" -p build-iwyu -j "$(nproc)" "${IWYU_SOURCES[@]}"
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,11 @@ if(BUILD_TEST)
add_subdirectory(tests)
endif()

if(USE_MACA)
add_executable(test_maca_allocator test/runtime/test_maca_allocator.cc)
link_infini_train_exe(test_maca_allocator)
endif()

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

去掉。

# Negative compile test: missing dtype registration must fail at compile time.
set(DTYPE_DISPATCH_COMPILE_FAIL_SOURCE
${PROJECT_SOURCE_DIR}/tests/dtype/test_dtype_dispatch_compile_fail.cc)
Expand Down
3 changes: 2 additions & 1 deletion example/common/tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@

#include "glog/logging.h"

#include "example/common/utils.h"
#include "infini_train/include/nn/functional.h"
#include "infini_train/include/nn/modules/module.h"
#include "infini_train/include/tensor.h"

#include "example/common/utils.h"

namespace infini_train {

constexpr uint32_t kGpt2Eot = 50256;
Expand Down
9 changes: 5 additions & 4 deletions example/gpt2/checkpoint_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

#include "glog/logging.h"

#include "example/common/utils.h"
#include "example/gpt2/config.h"
#include "infini_train/include/nn/modules/normalization.h"
#include "infini_train/include/nn/modules/sparse.h"
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
Expand All @@ -24,6 +22,9 @@
#include "infini_train/include/nn/parallel/tensor_parallel.h"
#include "infini_train/include/tensor.h"

#include "example/common/utils.h"
#include "example/gpt2/config.h"

using namespace infini_train;
namespace nn = infini_train::nn;

Expand Down Expand Up @@ -101,7 +102,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
// ========== pp_size:num_stages; vpp_size: num_chunks_per_stage ==========
int pp_size = nn::parallel::global::GetPipelineParallelSize();
int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
auto pp_rank = nn::parallel::pp_rank;
auto pp_rank = nn::parallel::tls_pp_rank;
auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
= nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
// ========== layer to chunk ==========
Expand All @@ -110,7 +111,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
for (int i = start; i < end; ++i) { owned_layers[i] = true; }
}

auto tp_rank = nn::parallel::tp_rank;
auto tp_rank = nn::parallel::tls_tp_rank;
// calculate xx_size_per_partition
const int64_t vpp = model_vocab_size / tp_size;
const int64_t v_start = static_cast<int64_t>(tp_rank) * vpp;
Expand Down
7 changes: 3 additions & 4 deletions example/gpt2/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ void Train(const nn::parallel::Rank &rank) {

// Set thread-local global rank
// TODO(dcj): Use DeviceGuardImpl to get GlobalRank later.
nn::parallel::global::thread_global_rank = rank.GlobalRank();
nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();

const ProcessGroup *ddp_pg = nullptr;
const ProcessGroup *tp_pg = nullptr;
Expand All @@ -158,15 +158,14 @@ void Train(const nn::parallel::Rank &rank) {
GetTensorParallelGroupRanks(rank.GlobalRank()));
tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
// NOTE(zbl): Reserved for VocabParallelEmbedding
nn::parallel::tp_rank = tp_rank;
nn::parallel::tls_tp_rank = tp_rank;
}

if (pp_world_size > 1) {
pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
GetPipelineParallelGroupRanks(rank.GlobalRank()));
pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());

nn::parallel::pp_rank = pp_rank;
nn::parallel::tls_pp_rank = pp_rank;
}
} else {
device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
Expand Down
9 changes: 5 additions & 4 deletions example/llama3/checkpoint_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

#include "glog/logging.h"

#include "example/common/utils.h"
#include "example/llama3/config.h"
#include "infini_train/include/nn/modules/normalization.h"
#include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
#include "infini_train/include/nn/modules/transformer/mlp.h"
Expand All @@ -22,6 +20,9 @@
#include "infini_train/include/nn/parallel/tensor_parallel.h"
#include "infini_train/include/tensor.h"

#include "example/common/utils.h"
#include "example/llama3/config.h"

using namespace infini_train;
namespace nn = infini_train::nn;

Expand Down Expand Up @@ -86,7 +87,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
// ========== pp_size:num_stages; vpp_size: num_chunks_per_stage ==========
int pp_size = nn::parallel::global::GetPipelineParallelSize();
int vpp_size = nn::parallel::global::GetVirtualPipelineParallelSize();
auto pp_rank = nn::parallel::pp_rank;
auto pp_rank = nn::parallel::tls_pp_rank;
auto [is_first_stage, is_last_stage, layer_ranges_per_chunk]
= nn::parallel::PipelineParallel::GetStageInfo(n_layer, pp_size, pp_rank, vpp_size);
// ========== layer to chunk ==========
Expand All @@ -96,7 +97,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
}

const int tp_size = nn::parallel::global::GetTensorParallelSize();
const int tp_rank = nn::parallel::tp_rank;
const int tp_rank = nn::parallel::tls_tp_rank;

CHECK_EQ(n_embd % tp_size, 0) << "n_embd must be divisible by TP world size.";
CHECK_EQ(n_head % tp_size, 0) << "n_head must be divisible by TP world size.";
Expand Down
7 changes: 3 additions & 4 deletions example/llama3/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ void Train(const nn::parallel::Rank &rank) {
int pp_rank = 0;

// Set thread-local global rank
nn::parallel::global::thread_global_rank = rank.GlobalRank();
nn::parallel::global::tls_thread_global_rank = rank.GlobalRank();

const ProcessGroup *ddp_pg = nullptr;
const ProcessGroup *tp_pg = nullptr;
Expand All @@ -143,15 +143,14 @@ void Train(const nn::parallel::Rank &rank) {
GetTensorParallelGroupRanks(rank.GlobalRank()));
tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
// NOTE(zbl): Reserved for VocabParallelEmbedding
nn::parallel::tp_rank = tp_rank;
nn::parallel::tls_tp_rank = tp_rank;
}

if (pp_world_size > 1) {
pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
GetPipelineParallelGroupRanks(rank.GlobalRank()));
pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());

nn::parallel::pp_rank = pp_rank;
nn::parallel::tls_pp_rank = pp_rank;
}
} else {
device = FLAGS_device == kDeviceCPU ? Device() : Device(Device::DeviceType::kCUDA, 0);
Expand Down
9 changes: 4 additions & 5 deletions infini_train/include/autograd/grad_mode.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@ namespace infini_train::autograd {

class GradMode {
public:
// Whether to enable Autograd (enabled by default)
static bool IsEnabled() { return grad_enabled_; }
static void SetEnabled(bool enabled) { grad_enabled_ = enabled; }
// Whether to enable Autograd (enabled by default).
static bool IsEnabled() { return tls_grad_enabled_; }
static void SetEnabled(bool enabled) { tls_grad_enabled_ = enabled; }

private:
// grad mode should be thread_local
static thread_local bool grad_enabled_;
static thread_local bool tls_grad_enabled_;
};

// RAII: Disable grad (align with torch.no_grad)
Expand Down
2 changes: 1 addition & 1 deletion infini_train/include/nn/parallel/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace infini_train::nn::parallel::global {

extern thread_local int thread_global_rank;
extern thread_local int tls_thread_global_rank;

// Axis identifiers for the parallel layout; the values are consecutive from 0, with AXIS_COUNT (3) as the last enumerator.
// NOTE(review): DP/TP/PP presumably mean data/tensor/pipeline parallel, and AXIS_COUNT presumably serves as the axis count — confirm.
enum Axis : uint8_t { DP = 0, TP = 1, PP = 2, AXIS_COUNT = 3 };

Expand Down
2 changes: 1 addition & 1 deletion infini_train/include/nn/parallel/pp/pipeline_parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {
class PipelineStage;
class PipelineSchedule;

extern thread_local int pp_rank;
extern thread_local int tls_pp_rank;

struct StageInfo {
bool is_first_stage;
Expand Down
2 changes: 1 addition & 1 deletion infini_train/include/nn/parallel/tensor_parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace infini_train::nn::parallel {

// NOTE(zbl): Reserved for VocabParallelEmbedding, since rank is needed in its constructor before any Device exists
// On other occasions, should use Device::Rank()
extern thread_local int tp_rank;
extern thread_local int tls_tp_rank;

class ColumnParallelLinear : public nn::CloneableModule<ColumnParallelLinear> {
public:
Expand Down
17 changes: 9 additions & 8 deletions infini_train/include/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ namespace core {
class Event;
}

inline thread_local int g_profiling_depth = 0;
inline thread_local int tls_profiling_depth = 0;

// Profiling context record: a name paired with a device type.
// Written by SetProfileContext and returned by reference from GetProfileContext.
struct ProfileContext {
// Label stored by SetProfileContext.
std::string name;
// Device type stored alongside the label.
Device::DeviceType device;
};

inline thread_local ProfileContext g_profile_context;
inline thread_local ProfileContext tls_profile_context;

inline void SetProfileContext(const std::string &name, Device::DeviceType device) {
if (g_profiling_depth == 0) {
g_profile_context.name = name;
g_profile_context.device = device;
if (tls_profiling_depth == 0) {
tls_profile_context.name = name;
tls_profile_context.device = device;
}
}

inline const ProfileContext &GetProfileContext() { return g_profile_context; }
inline const ProfileContext &GetProfileContext() { return tls_profile_context; }

struct KernelProfileInfo {
int64_t host_total_us = 0;
Expand Down Expand Up @@ -89,13 +89,14 @@ class Profiler {
std::string current_tag_ = "Untagged";

// thread-local tracking
thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point> cpu_timing_map_;
thread_local static inline std::map<std::string, std::chrono::high_resolution_clock::time_point>
tls_cpu_timing_map_;

// Pair of start/stop event pointers kept together; both default to nullptr.
// NOTE(review): ownership and lifetime of the pointed-to events are not visible here — confirm.
struct EventPair {
core::Event *start = nullptr;
core::Event *stop = nullptr;
};

thread_local static inline std::map<std::string, EventPair> device_timing_map_;
thread_local static inline std::map<std::string, EventPair> tls_device_timing_map_;
};
} // namespace infini_train
5 changes: 3 additions & 2 deletions infini_train/include/utils/global_module_hook_registry.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#pragma once

#include "infini_train/include/common/hook.h"
#include "infini_train/include/tensor.h"
#include <functional>
#include <memory>
#include <mutex>
#include <vector>

#include "infini_train/include/common/hook.h"
#include "infini_train/include/tensor.h"

namespace infini_train {
namespace nn {
class Module;
Expand Down
2 changes: 1 addition & 1 deletion infini_train/src/autograd/grad_mode.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "infini_train/include/autograd/grad_mode.h"

namespace infini_train::autograd {
thread_local bool GradMode::grad_enabled_ = true;
thread_local bool GradMode::tls_grad_enabled_ = true;
} // namespace infini_train::autograd
3 changes: 2 additions & 1 deletion infini_train/src/core/ccl/cuda/nccl_impl.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "infini_train/src/core/ccl/cuda/nccl_impl.h"

#include <nccl.h>
#include <vector>

#include <nccl.h>
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C 语言系统库头文件应当放在 C++ 标准库头文件的上一个分组。


#include "glog/logging.h"

#include "infini_train/include/common/cuda/common_cuda.h"
Expand Down
2 changes: 1 addition & 1 deletion infini_train/src/kernels/cpu/embedding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ std::shared_ptr<Tensor> EmbeddingBackward(const std::shared_ptr<Tensor> &input,
for (int i = 0; i < input->NumElements(); ++i) {
int idx = static_cast<int>(static_cast<const int64_t *>(input->DataPtr())[i]);
for (int j = 0; j < embedding_dim; ++j) {
static_cast<float *>(grad_weight->DataPtr())[idx * embedding_dim + j] // <-- 修复这里
static_cast<float *>(grad_weight->DataPtr())[idx * embedding_dim + j]
+= static_cast<const float *>(grad_output->DataPtr())[i * embedding_dim + j];
}
}
Expand Down
2 changes: 2 additions & 0 deletions infini_train/src/kernels/cpu/linear.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "infini_train/include/autograd/linear.h"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个不是配套头文件,不需要放首行,以及理论上 kernel 层不应当引用 autograd 层头文件。


#include <cstdint>
#include <memory>
#include <numeric>
Expand Down
Loading
Loading