diff --git a/scripts/compare_loss.py b/scripts/compare_loss.py
index 8b581266..31b2a009 100755
--- a/scripts/compare_loss.py
+++ b/scripts/compare_loss.py
@@ -9,6 +9,7 @@
 import sys
 from pathlib import Path
 from argparse import ArgumentParser
+from compare_utils import collect_log_files, exit_if_duplicate_logs
 
 def get_dtype_from_filename(filename):
     """Determine dtype from filename. Returns 'bfloat16' or 'fp32'."""
@@ -62,8 +63,10 @@ def main():
         args.threshold_fp32 = args.threshold
         args.threshold_bf16 = args.threshold
 
-    files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
-    files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
+    files1, duplicates1 = collect_log_files(args.dir1)
+    files2, duplicates2 = collect_log_files(args.dir2)
+    exit_if_duplicate_logs(args.dir1, duplicates1)
+    exit_if_duplicate_logs(args.dir2, duplicates2)
 
     only_in_1 = set(files1.keys()) - set(files2.keys())
     only_in_2 = set(files2.keys()) - set(files1.keys())
diff --git a/scripts/compare_tps.py b/scripts/compare_tps.py
index 270b1ddd..de6327de 100755
--- a/scripts/compare_tps.py
+++ b/scripts/compare_tps.py
@@ -9,6 +9,7 @@
 import sys
 from pathlib import Path
 from argparse import ArgumentParser
+from compare_utils import collect_log_files, exit_if_duplicate_logs
 
 def parse_log(file_path):
     """Extract step -> tok/s mapping from log file."""
@@ -55,8 +56,10 @@ def main():
     parser.add_argument('--verbose', action='store_true', help='Print detailed output for all files, including passed ones')
     args = parser.parse_args()
 
-    files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
-    files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
+    files1, duplicates1 = collect_log_files(args.dir1)
+    files2, duplicates2 = collect_log_files(args.dir2)
+    exit_if_duplicate_logs(args.dir1, duplicates1)
+    exit_if_duplicate_logs(args.dir2, duplicates2)
 
     only_in_1 = set(files1.keys()) - set(files2.keys())
     only_in_2 = set(files2.keys()) - set(files1.keys())
diff --git a/scripts/compare_utils.py b/scripts/compare_utils.py
new file mode 100644
index 00000000..0831f7be
--- /dev/null
+++ b/scripts/compare_utils.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+import sys
+
+
+def collect_log_files(base_dir: Path):
+    """Recursively collect comparable ``*.log`` files under *base_dir*.
+
+    Skips ``build*`` / ``*_profile.log`` names and non-files. Returns
+    ``(files, duplicates)``: basename -> first path in sorted order, and
+    repeated basename -> all its paths (the one kept in ``files`` first).
+    """
+    files, duplicates = {}, {}
+    for path in sorted(base_dir.rglob("*.log")):  # sorted: result independent of FS order
+        name = path.name
+        if name.startswith("build") or name.endswith("_profile.log") or not path.is_file():
+            continue
+        if name in files:
+            duplicates.setdefault(name, [files[name]]).append(path)
+        files.setdefault(name, path)
+    return files, duplicates
+
+
+def exit_if_duplicate_logs(base_dir: Path, duplicates):
+    """Exit with status 1, listing each clash, if *duplicates* is non-empty."""
+    if duplicates:
+        print(f"Found duplicate log basenames in {base_dir.resolve()}, cannot compare safely:")
+        for name in sorted(duplicates):
+            # Paths are printed relative to base_dir so the clashing subdirectories stand out.
+            shown = ", ".join(str(p.relative_to(base_dir)) for p in duplicates[name])
+            print(f"  {name}: {shown}")
+        sys.exit(1)
diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash
index 1cf27935..b183a936 100755
--- a/scripts/run_models_and_profile.bash
+++ b/scripts/run_models_and_profile.bash
@@ -3,7 +3,56 @@
 set -e
 set -o pipefail
 
-CONFIG_FILE="${1:-test_config.json}"
+# Print the command-line help text to stdout.
+usage() { cat <<'EOF'
+Usage: run_models_and_profile.bash [--test-config path] [--only-run tag1,tag2]
+
+Options:
+  --test-config PATH  Path to test config JSON. Default: test_config.json.
+  --only-run TAGS   Only run the specified tag groups, separated by commas.
+  -h, --help        Show this help message.
+EOF
+}
+
+CONFIG_FILE="test_config.json"  # default config; overridden by --test-config
+ONLY_RUN_TAGS=""                # raw comma-separated value of --only-run (empty = no filter)
+# Parse options ("--opt VALUE" and "--opt=VALUE" forms); diagnostics go to stderr.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --test-config)
+            [[ $# -lt 2 ]] && { echo "Error: --test-config requires a file path." >&2; exit 1; }
+            CONFIG_FILE="$2"
+            shift 2
+            ;;
+        --test-config=*)
+            CONFIG_FILE="${1#*=}"
+            shift
+            ;;
+        --only-run)
+            [[ $# -lt 2 ]] && { echo "Error: --only-run requires a comma-separated tag list." >&2; exit 1; }
+            ONLY_RUN_TAGS="$2"
+            shift 2
+            ;;
+        --only-run=*)
+            ONLY_RUN_TAGS="${1#*=}"
+            shift
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*)
+            echo "Error: Unknown option: $1" >&2
+            usage >&2
+            exit 1
+            ;;
+        *)
+            echo "Error: Unknown positional argument: $1" >&2
+            usage >&2
+            exit 1
+            ;;
+    esac
+done
 
 # Dependencies check
 if ! command -v jq >/dev/null 2>&1; then
@@ -33,6 +82,28 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")
 
 # Global variable to save the last cmake command
 LAST_CMAKE_CMD=""
+declare -A SELECTED_TAGS=()
+
+normalize_tag() {
+    # Strip leading, then trailing, whitespace from $1 and print it without a newline.
+    local trimmed="${1#"${1%%[![:space:]]*}"}"
+    trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"
+    printf '%s' "$trimmed"
+}
+
+if [[ -n "$ONLY_RUN_TAGS" ]]; then
+    # Split the --only-run value on commas; each trimmed, non-empty piece becomes a key.
+    IFS=',' read -r -a requested_tags <<< "$ONLY_RUN_TAGS"
+    for raw_tag in "${requested_tags[@]}"; do
+        tag="$(normalize_tag "$raw_tag")"
+        [[ -z "$tag" ]] && continue
+        SELECTED_TAGS["$tag"]=1
+    done
+    if [[ ${#SELECTED_TAGS[@]} -eq 0 ]]; then  # e.g. --only-run ","
+        echo "Error: --only-run did not contain any valid tags." >&2
+        exit 1
+    fi
+fi
 
 # Clean the build directory
 clean_build_dir() {
@@ -46,9 +117,12 @@ run_and_log() {
     local cmd="$1"
     local log_name="$2"
     local is_profile="$3"
+    local tag="${4:-basic}"
     local timestamp
     timestamp=$(date '+%Y-%m-%d %H:%M:%S')
-    local log_path="$(realpath "${LOG_DIR}/${log_name}.log")"
+    local tag_log_dir="${LOG_DIR}/${tag}"
+    mkdir -p "$tag_log_dir"
+    local log_path="$(realpath "${tag_log_dir}/${log_name}.log")"
 
     echo -e "\033[1;32m============================================================\033[0m"
     echo -e "\033[1;36m[$timestamp] [Running] ${log_name}\033[0m"
@@ -99,7 +173,7 @@ run_and_log() {
 
     # If profiling is enabled, move profiling files to the target directory
     if [[ "$is_profile" == "yes" ]]; then
-        move_profile_logs "$log_name"
+        move_profile_logs "$log_name" "$tag"
     fi
 }
 
@@ -107,14 +181,17 @@ run_and_log() {
 # Move profiling output logs
 move_profile_logs() {
     local prefix="$1"
+    local tag="${2:-basic}"
+    local tag_profile_dir="${PROFILE_LOG_DIR}/${tag}"
+    mkdir -p "$tag_profile_dir"
 
     # Move *.report.rankN files
     for report_file in "${BUILD_DIR}"/*.report.rank*; do
         if [[ -f "$report_file" ]]; then
             local base_name
             base_name=$(basename "$report_file")
-            mv "$report_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
-            echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
+            mv "$report_file" "${tag_profile_dir}/${prefix}_${base_name}"
+            echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
         fi
     done
 
@@ -123,17 +200,18 @@ move_profile_logs() {
         if [[ -f "$record_file" ]]; then
             local base_name
             base_name=$(basename "$record_file")
-            mv "$record_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
-            echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
+            mv "$record_file" "${tag_profile_dir}/${prefix}_${base_name}"
+            echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
         fi
     done
 }
 
-# Build "--key value" arg string from tests[i].args (shell-escaped)
+# Build "--key value" arg string from test_groups[gi].tests[ti].args (shell-escaped)
 args_string_for_test() {
-    local idx="$1"
-    jq -r --argjson i "$idx" '
-      .tests[$i].args
+    local group_idx="$1"
+    local test_idx="$2"
+    jq -r --argjson g "$group_idx" --argjson t "$test_idx" '
+      .test_groups[$g].tests[$t].args
       | to_entries[]
       | "--\(.key) \(.value|tostring)"
     ' "$CONFIG_FILE" | paste -sd' ' -
@@ -141,7 +219,20 @@ args_string_for_test() {
 
 # Run tests
 num_builds=$(jq '.builds | length' "$CONFIG_FILE")
-num_tests=$(jq '.tests  | length' "$CONFIG_FILE")
+num_groups=$(jq '.test_groups | length' "$CONFIG_FILE")
+
+selected_group_count=0  # groups passing the --only-run filter; checked before any build starts
+for ((gi=0; gi<num_groups; ++gi)); do
+    group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
+    if [[ ${#SELECTED_TAGS[@]} -eq 0 || -n "${SELECTED_TAGS[$group_tag]}" ]]; then
+        ((selected_group_count += 1))
+    fi
+done
+if [[ "$selected_group_count" -eq 0 && -z "$ONLY_RUN_TAGS" ]]; then  # no filter: config itself is empty
+    echo "Error: No test groups found in ${CONFIG_FILE} (expected a non-empty .test_groups array)" >&2; exit 1
+elif [[ "$selected_group_count" -eq 0 ]]; then
+    echo "Error: No matching test groups found for --only-run=${ONLY_RUN_TAGS}" >&2; exit 1
+fi
 
 for ((id=0; id<num_builds; ++id)); do
     build_id=$(jq -r ".builds[$id].id" "$CONFIG_FILE")
@@ -152,7 +243,7 @@ for ((id=0; id<num_builds; ++id)); do
 
     # always clean before another build
     clean_build_dir
-    run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no"
+    run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
 
     # profile flag for runs
     profile_flag="no"
@@ -162,17 +253,27 @@ for ((id=0; id<num_builds; ++id)); do
         log_suffix="_profile"
     fi
 
-    for ((ti=0; ti<num_tests; ++ti)); do
-        test_id=$(jq -r ".tests[$ti].id" "$CONFIG_FILE")
-        arg_str="$(args_string_for_test "$ti")"
+    for ((gi=0; gi<num_groups; ++gi)); do
+        group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
+        if [[ ${#SELECTED_TAGS[@]} -gt 0 && -z "${SELECTED_TAGS[$group_tag]}" ]]; then
+            continue
+        fi
+
+        num_tests=$(jq ".test_groups[$gi].tests | length" "$CONFIG_FILE")
+        echo -e "\033[1;36m[TEST GROUP] tag=${group_tag}, cases=${num_tests}\033[0m"
 
-        # gpt2
-        gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
-        run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag"
+        for ((ti=0; ti<num_tests; ++ti)); do
+            test_id=$(jq -r ".test_groups[$gi].tests[$ti].id" "$CONFIG_FILE")
+            arg_str="$(args_string_for_test "$gi" "$ti")"
 
-        # llama3
-        llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
-        run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag"
+            # gpt2
+            gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
+            run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
+
+            # llama3
+            llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
+            run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
+        done
     done
 done
 
@@ -202,3 +303,6 @@ else
     echo -e "\033[1;33m         or export COMPARE_LOG_DIR=/path/to/baseline_logs before running.\033[0m"
     echo -e "\033[1;33m============================================================\033[0m"
 fi
+
+echo -e "\n\033[1;36m[END OF TEST] Cleaning build directory after all tests\033[0m"
+clean_build_dir
diff --git a/scripts/test_config.json b/scripts/test_config.json
index 5659b516..38a751ea 100644
--- a/scripts/test_config.json
+++ b/scripts/test_config.json
@@ -21,279 +21,288 @@
             "cmd": "cmake -DUSE_CUDA=ON -DUSE_NCCL=ON -DPROFILE_MODE=ON .. && make -j"
         }
     ],
-    "tests": [
+    "test_groups": [
         {
-            "id": "1",
-            "args": {
-                "dtype": "float32"
-            }
+            "tag": "basic",
+            "tests": [
+                {
+                    "id": "1",
+                    "args": {
+                        "dtype": "float32"
+                    }
+                },
+                {
+                    "id": "1_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16"
+                    }
+                },
+                {
+                    "id": "2",
+                    "args": {
+                        "dtype": "float32",
+                        "num_iteration": 10,
+                        "batch_size": 80,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "2_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "num_iteration": 10,
+                        "batch_size": 80,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "3",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "3_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120
+                    }
+                },
+                {
+                    "id": "4",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4
+                    }
+                },
+                {
+                    "id": "4_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4
+                    }
+                },
+                {
+                    "id": "5",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true
+                    }
+                },
+                {
+                    "id": "5_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true
+                    }
+                },
+                {
+                    "id": "6",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 8
+                    }
+                },
+                {
+                    "id": "6_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 8
+                    }
+                },
+                {
+                    "id": "7",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 4,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 4,
+                        "virtual_pipeline_parallel": 2
+                    }
+                },
+                {
+                    "id": "7_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 4,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "pipeline_parallel": 4,
+                        "virtual_pipeline_parallel": 2
+                    }
+                },
+                {
+                    "id": "8",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2
+                    }
+                },
+                {
+                    "id": "8_bfloat16",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2
+                    }
+                }
+            ]
         },
         {
-            "id": "1_bfloat16",
-            "args": {
-                "dtype": "bfloat16"
-            }
-        },
-        {
-            "id": "2",
-            "args": {
-                "dtype": "float32",
-                "num_iteration": 10,
-                "batch_size": 80,
-                "total_batch_size": 5120
-            }
-        },
-        {
-            "id": "2_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "num_iteration": 10,
-                "batch_size": 80,
-                "total_batch_size": 5120
-            }
-        },
-        {
-            "id": "3",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120
-            }
-        },
-        {
-            "id": "3_distopt",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "3_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120
-            }
-        },
-        {
-            "id": "3_bfloat16_distopt",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "4",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4
-            }
-        },
-        {
-            "id": "4_distopt",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "4_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4
-            }
-        },
-        {
-            "id": "4_bfloat16_distopt",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "5",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4,
-                "sequence_parallel": true
-            }
-        },
-        {
-            "id": "5_distopt",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4,
-                "sequence_parallel": true,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "5_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4,
-                "sequence_parallel": true
-            }
-        },
-        {
-            "id": "5_bfloat16_distopt",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 4,
-                "sequence_parallel": true,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "6",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120,
-                "pipeline_parallel": 8
-            }
-        },
-        {
-            "id": "6_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120,
-                "pipeline_parallel": 8
-            }
-        },
-        {
-            "id": "7",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 4,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120,
-                "pipeline_parallel": 4,
-                "virtual_pipeline_parallel": 2
-            }
-        },
-        {
-            "id": "7_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 4,
-                "num_iteration": 10,
-                "batch_size": 10,
-                "total_batch_size": 5120,
-                "pipeline_parallel": 4,
-                "virtual_pipeline_parallel": 2
-            }
-        },
-        {
-            "id": "8",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 2,
-                "sequence_parallel": true,
-                "pipeline_parallel": 2,
-                "virtual_pipeline_parallel": 2
-            }
-        },
-        {
-            "id": "8_distopt",
-            "args": {
-                "dtype": "float32",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 2,
-                "sequence_parallel": true,
-                "pipeline_parallel": 2,
-                "virtual_pipeline_parallel": 2,
-                "use_distributed_optimizer": true
-            }
-        },
-        {
-            "id": "8_bfloat16",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 2,
-                "sequence_parallel": true,
-                "pipeline_parallel": 2,
-                "virtual_pipeline_parallel": 2
-            }
-        },
-        {
-            "id": "8_bfloat16_distopt",
-            "args": {
-                "dtype": "bfloat16",
-                "nthread_per_process": 8,
-                "num_iteration": 10,
-                "batch_size": 40,
-                "total_batch_size": 5120,
-                "tensor_parallel": 2,
-                "sequence_parallel": true,
-                "pipeline_parallel": 2,
-                "virtual_pipeline_parallel": 2,
-                "use_distributed_optimizer": true
-            }
+            "tag": "zero",
+            "tests": [
+                {
+                    "id": "3_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "3_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 10,
+                        "total_batch_size": 5120,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "4_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "4_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "5_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "5_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 4,
+                        "sequence_parallel": true,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "8_distopt",
+                    "args": {
+                        "dtype": "float32",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2,
+                        "use_distributed_optimizer": true
+                    }
+                },
+                {
+                    "id": "8_bfloat16_distopt",
+                    "args": {
+                        "dtype": "bfloat16",
+                        "nthread_per_process": 8,
+                        "num_iteration": 10,
+                        "batch_size": 40,
+                        "total_batch_size": 5120,
+                        "tensor_parallel": 2,
+                        "sequence_parallel": true,
+                        "pipeline_parallel": 2,
+                        "virtual_pipeline_parallel": 2,
+                        "use_distributed_optimizer": true
+                    }
+                }
+            ]
         }
     ]
 }
-
diff --git a/scripts/write_to_feishu_sheet.py b/scripts/write_to_feishu_sheet.py
index 28c41661..670e9012 100644
--- a/scripts/write_to_feishu_sheet.py
+++ b/scripts/write_to_feishu_sheet.py
@@ -2,6 +2,7 @@
 import json
 import time
 import os
+import sys
 import argparse
 import glob
 import re
@@ -14,6 +15,10 @@
 HEADER_ROWS=5
 HEADER_COLS="W"
 
+# Retry settings
+REQUEST_RETRY_TIMES=3
+REQUEST_RETRY_DELAY=10
+
 class FeishuSheetHandler:
     """Feishu Sheet Handler for retrieving and writing sheet data"""
 
@@ -25,13 +30,36 @@ def __init__(self, app_id, app_secret):
         self.token_expire_time = 0
         self.get_access_token()
 
+    def _request_with_timeout_retry(self, request_func, request_name):
+        """Call request_func(), retrying on requests Timeout (connect or read).
+
+        request_func: zero-argument callable whose return value is passed through.
+        request_name: label used in log output. After REQUEST_RETRY_TIMES timed-out
+        attempts a FATAL message is printed and the process exits with status 1.
+        """
+        for attempt in range(1, REQUEST_RETRY_TIMES + 1):
+            try:
+                return request_func()
+            except requests.exceptions.Timeout:  # base of ConnectTimeout and ReadTimeout
+                if attempt == REQUEST_RETRY_TIMES:
+                    print(f"FATAL: HTTP timeout after {REQUEST_RETRY_TIMES} attempts while handling "
+                          f"{request_name}. Please manually revert the Feishu sheet to a previous version.")
+                    sys.exit(1)
+                print(f"{request_name} timed out on attempt {attempt}/{REQUEST_RETRY_TIMES}, retry after {REQUEST_RETRY_DELAY}s")
+                time.sleep(REQUEST_RETRY_DELAY)
+
     def get_access_token(self):
         """Get and cache tenant_access_token"""
         if self.access_token and time.time() < self.token_expire_time:
             return self.access_token
 
         url = f"{self.base_url}/auth/v3/tenant_access_token/internal"
-        resp = requests.post(url, json={"app_id": self.app_id, "app_secret": self.app_secret}, timeout=10)
+        resp = self._request_with_timeout_retry(
+            lambda: requests.post(url, json={"app_id": self.app_id, "app_secret": self.app_secret}, timeout=10),
+            "Get access token"
+        )
+        if resp is None:
+            return None
         if resp.status_code != 200:
             print("Failed to get token: HTTP error", resp.status_code)
             return None
@@ -57,7 +85,12 @@ def _feishu_request(self, method, endpoint, **kwargs):
         }
 
         url = f"{self.base_url}{endpoint}"
-        resp = requests.request(method, url, headers=headers, timeout=15, **kwargs)
+        resp = self._request_with_timeout_retry(
+            lambda: requests.request(method, url, headers=headers, timeout=15, **kwargs),
+            f"{method} {endpoint}"
+        )
+        if resp is None:
+            return None
 
         if resp.status_code != 200:
             print(f"Request failed: HTTP {resp.status_code}")
@@ -226,6 +259,47 @@ def convert_to_feishu_date(dt):
         return (dt - base_date).days + 2
 
 
+def normalize_tag_spreadsheet_configs(config):
+    """Normalize config into a list of tag-specific spreadsheet mappings.
+
+    Returns a list of {"tag", "MODEL_SPREADSHEET_TOKEN"} dicts, or None (after
+    printing the reason) when the config is invalid. A config that only has the
+    legacy top-level MODEL_SPREADSHEET_TOKEN is mapped to the single tag "basic".
+    """
+    tag_configs = config.get("TAG_SPREADSHEET_CONFIGS")
+    if tag_configs is None:
+        # Legacy single-spreadsheet layout: no per-tag list present.
+        legacy_tokens = config.get("MODEL_SPREADSHEET_TOKEN")
+        if isinstance(legacy_tokens, dict) and legacy_tokens:
+            return [{"tag": "basic", "MODEL_SPREADSHEET_TOKEN": legacy_tokens}]
+        print("Config file must contain TAG_SPREADSHEET_CONFIGS or MODEL_SPREADSHEET_TOKEN")
+        return None
+
+    if not isinstance(tag_configs, list) or not tag_configs:
+        print("TAG_SPREADSHEET_CONFIGS must be a non-empty list")
+        return None
+
+    normalized = []
+    for item in tag_configs:
+        if not isinstance(item, dict):
+            print("Each TAG_SPREADSHEET_CONFIGS item must be a JSON object")
+            return None
+
+        tag = item.get("tag")
+        model_tokens = item.get("MODEL_SPREADSHEET_TOKEN")
+        # The tag is later used as a path component, so it must be a real string.
+        if not isinstance(tag, str) or not tag:
+            print("Each TAG_SPREADSHEET_CONFIGS item must contain a non-empty string tag")
+            return None
+        if not isinstance(model_tokens, dict) or not model_tokens:
+            print(f"MODEL_SPREADSHEET_TOKEN for tag={tag} must be a non-empty dictionary")
+            return None
+
+        normalized.append({"tag": tag, "MODEL_SPREADSHEET_TOKEN": model_tokens})
+
+    return normalized
+
+
 def load_config(config_file):
     """Load configuration from JSON file"""
     if not os.path.exists(config_file):
@@ -239,16 +313,17 @@ def load_config(config_file):
         print(f"Config file {config_file} is not valid JSON file")
         return None
 
-    required_keys = ["APP_ID", "APP_SECRET", "MODEL_SPREADSHEET_TOKEN"]
+    required_keys = ["APP_ID", "APP_SECRET"]
     for key in required_keys:
         if key not in config:
             print(f"Config file missing required key: {key}")
             return None
 
-    if not isinstance(config["MODEL_SPREADSHEET_TOKEN"], dict) or not config["MODEL_SPREADSHEET_TOKEN"]:
-        print("MODEL_SPREADSHEET_TOKEN must be a non-empty dictionary")
+    tag_configs = normalize_tag_spreadsheet_configs(config)
+    if not tag_configs:
         return None
 
+    config["TAG_SPREADSHEET_CONFIGS"] = tag_configs
     return config
 
 def parse_command_args(log_content: str, start_flag="--dtype"):
@@ -376,9 +451,9 @@ def parse_profile_report(profile_content):
         return merged_df.head(5).iloc[:, :16]
     return None
 
-def discover_testcases(model_name: str, log_dir="logs"):
+def discover_testcases(model_name: str, tag: str, log_dir="logs"):
     """Get all test case id from local log dir"""
-    pattern = os.path.join(log_dir, f"{model_name}_*.log")
+    pattern = os.path.join(log_dir, tag, f"{model_name}_*.log")
     files = glob.glob(pattern)
     testcases = []
     prefix = f"{model_name}_"
@@ -409,12 +484,13 @@ def get_git_commit_id():
         return "unknown"
 
 
-def get_model_data(model_name, sheet_title):
+def get_model_data(model_name, sheet_title, tag, log_dir="logs", profile_log_dir="profile_logs"):
     """Construct 2D list for writing to Feishu"""
-    log_file_path = f"logs/{model_name}_{sheet_title}.log"
-    profile_file_path = f"profile_logs/{model_name}_{sheet_title}_profile_{model_name}.report.rank0"
+    log_file_path = os.path.join(log_dir, tag, f"{model_name}_{sheet_title}.log")
+    profile_file_path = os.path.join(profile_log_dir, tag, f"{model_name}_{sheet_title}_profile_{model_name}.report.rank0")
 
     avg_latency, avg_throughput, peak_used_max, peak_reserved_max = None, None, None, None
+    cmd_args = None
 
     # Read training log
     if os.path.exists(log_file_path):
@@ -436,7 +512,7 @@ def get_model_data(model_name, sheet_title):
         print(f"Performance report does not exist: {profile_file_path}")
 
     if report_df is None:
-        return []
+        return cmd_args, []
 
     # Insert $META_COLS empty columns at the front
     new_data = [["" for _ in range(META_COLS)] for _ in range(5)]
@@ -470,65 +546,69 @@ def main():
         return
 
     print(f"Successfully loaded config file: {args.config_file}")
-    print(f"Found {len(config['MODEL_SPREADSHEET_TOKEN'])} models to process")
+    print(f"Found {len(config['TAG_SPREADSHEET_CONFIGS'])} tag configs to process")
 
     handler = FeishuSheetHandler(
         app_id=config["APP_ID"],
         app_secret=config["APP_SECRET"]
     )
 
-    for model_name, spreadsheet_token in config["MODEL_SPREADSHEET_TOKEN"].items():
-        print(f"\n=== Start processing {model_name} ===")
-        model_name = model_name.lower()
+    for tag_config in config["TAG_SPREADSHEET_CONFIGS"]:
+        tag = tag_config["tag"]
+        print(f"\n=== Start processing tag={tag} ===")
 
-        testcases = discover_testcases(model_name)
-        if not testcases:
-            print(f"No local testcases found under logs/ for model={model_name}, skipping")
-            continue
-        print(f"Discovered {len(testcases)} local testcases: {testcases}")
+        for model_name, spreadsheet_token in tag_config["MODEL_SPREADSHEET_TOKEN"].items():
+            print(f"\n--- Processing model={model_name} tag={tag} ---")
+            model_name = model_name.lower()
 
-        remote_sheets = handler.get_all_sheet_ids(spreadsheet_token)
-        remote_by_title = {s["title"]: s["sheet_id"] for s in remote_sheets}
+            testcases = discover_testcases(model_name, tag)
+            if not testcases:
+                print(f"No local testcases found under logs/{tag}/ for model={model_name}, skipping")
+                continue
+            print(f"Discovered {len(testcases)} local testcases: {testcases}")
 
-        if "模板" not in remote_by_title:
-            print(f"No template sheets retrieved for {model_name}, skipping")
-            continue
-        template_sheet_id = remote_by_title["模板"]
+            remote_sheets = handler.get_all_sheet_ids(spreadsheet_token)
+            remote_by_title = {s["title"]: s["sheet_id"] for s in remote_sheets}
+
+            if "模板" not in remote_by_title:
+                print(f"No template sheets retrieved for model={model_name}, tag={tag}, skipping")
+                continue
+            template_sheet_id = remote_by_title["模板"]
 
-        sort_sheets = False
+            sort_sheets = False
 
-        for testcase in testcases:
-            print("\n-------")
-            sheet_id = remote_by_title.get(testcase)
-            write_cmd = False
+            for testcase in testcases:
+                print("\n-------")
+                sheet_id = remote_by_title.get(testcase)
+                write_cmd = False
 
-            if not sheet_id:
-                print(f"Sheet for '{testcase}' not found, creating from template...")
-                sheet_id = handler.create_sheet_for_testcase(spreadsheet_token, sheet_title=testcase, template_sheet_id=template_sheet_id)
                 if not sheet_id:
-                    print(f"Failed to create sheet '{testcase}', skipping")
+                    print(f"Sheet for '{testcase}' not found, creating from template...")
+                    sheet_id = handler.create_sheet_for_testcase(spreadsheet_token, sheet_title=testcase, template_sheet_id=template_sheet_id)
+                    if not sheet_id:
+                        print(f"Failed to create sheet '{testcase}', skipping")
+                        continue
+                    remote_by_title[testcase] = sheet_id
+                    sort_sheets = True
+                    write_cmd = True
+                    print(f"Created sheet '{testcase}' with id={sheet_id}")
+
+                print(f"Processing testcase '{testcase}' -> sheet_id={sheet_id}")
+
+                cmd_args, sheet_data = get_model_data(model_name=model_name, sheet_title=testcase, tag=tag)
+
+                if not sheet_data:
+                    print("No valid data generated, skipping")
                     continue
-                remote_by_title[testcase] = sheet_id
-                sort_sheets = True
-                write_cmd = True
-                print(f"Created sheet '{testcase}' with id={sheet_id}")
-            
-            print(f"Processing testcase '{testcase}' -> sheet_id={sheet_id}")
-
-            cmd_args, sheet_data = get_model_data(model_name=model_name, sheet_title=testcase)
-
-            if not sheet_data:
-                print("No valid data generated, skipping")
-                continue
 
-            if write_cmd and cmd_args:
-                handler.write_cmd_args_to_header(spreadsheet_token, cmd_args, sheet_id)
+                if write_cmd and cmd_args:
+                    handler.write_cmd_args_to_header(spreadsheet_token, cmd_args, sheet_id)
 
-            if handler.prepend_data(spreadsheet_token, sheet_id, sheet_data):
-                handler.post_process(spreadsheet_token, sheet_id)
+                if handler.prepend_data(spreadsheet_token, sheet_id, sheet_data):
+                    handler.post_process(spreadsheet_token, sheet_id)
 
-        if sort_sheets:
-            handler.sort_sheets_by_title(spreadsheet_token, "模板")
+            if sort_sheets:
+                handler.sort_sheets_by_title(spreadsheet_token, "模板")
 
     print("\n=== All models and sheets processed ===")