InfiniTensor · kilinchange · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/scripts/compare_loss.py b/scripts/compare_loss.py
@@ -9,6 +9,7 @@
 import sys
 from pathlib import Path
 from argparse import ArgumentParser
+from compare_utils import collect_log_files, exit_if_duplicate_logs
 
 def get_dtype_from_filename(filename):
     """Determine dtype from filename. Returns 'bfloat16' or 'fp32'."""
@@ -62,8 +63,10 @@ def main():
         args.threshold_fp32 = args.threshold
         args.threshold_bf16 = args.threshold
 
-    files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
-    files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
+    files1, duplicates1 = collect_log_files(args.dir1)
+    files2, duplicates2 = collect_log_files(args.dir2)
+    exit_if_duplicate_logs(args.dir1, duplicates1)
+    exit_if_duplicate_logs(args.dir2, duplicates2)
 
     only_in_1 = set(files1.keys()) - set(files2.keys())
     only_in_2 = set(files2.keys()) - set(files1.keys())

diff --git a/scripts/compare_tps.py b/scripts/compare_tps.py
@@ -9,6 +9,7 @@
 import sys
 from pathlib import Path
 from argparse import ArgumentParser
+from compare_utils import collect_log_files, exit_if_duplicate_logs
 
 def parse_log(file_path):
     """Extract step -> tok/s mapping from log file."""
@@ -55,8 +56,10 @@ def main():
     parser.add_argument('--verbose', action='store_true', help='Print detailed output for all files, including passed ones')
     args = parser.parse_args()
 
-    files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
-    files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
+    files1, duplicates1 = collect_log_files(args.dir1)
+    files2, duplicates2 = collect_log_files(args.dir2)
+    exit_if_duplicate_logs(args.dir1, duplicates1)
+    exit_if_duplicate_logs(args.dir2, duplicates2)
 
     only_in_1 = set(files1.keys()) - set(files2.keys())
     only_in_2 = set(files2.keys()) - set(files1.keys())

diff --git a/scripts/compare_utils.py b/scripts/compare_utils.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+import sys
+
+
+def collect_log_files(base_dir: Path):
+    """Collect comparable training logs found recursively under base_dir, keyed by basename; returns (files, duplicates)."""
+    files = {}  # basename -> first path seen with that basename
+    duplicates = {}  # basename -> all paths sharing it, first-seen path included
+
+    for path in base_dir.rglob("*.log"):  # recursive, so logs in sub-directories are found too
+        if path.name.startswith("build") or path.name.endswith("_profile.log"):
+            continue  # skip logs named build* or *_profile.log
+
+        key = path.name
+        if key in files:
+            duplicates.setdefault(key, [files[key]]).append(path)  # list starts with the first-seen path
+            continue  # files[key] keeps pointing at the first-seen path
+        files[key] = path
+
+    return files, duplicates
+
+
+def exit_if_duplicate_logs(base_dir: Path, duplicates):
+    """Abort when duplicate basenames make comparison ambiguous: print each one with its paths relative to base_dir, then exit with status 1."""
+    if not duplicates:
+        return  # nothing ambiguous, the caller carries on
+
+    print(f"Found duplicate log basenames in {base_dir.resolve()}, cannot compare safely:")  # report is written to stdout
+    for name, paths in sorted(duplicates.items()):  # iterate in basename order
+        print(f"  {name}: {', '.join(str(p.relative_to(base_dir)) for p in paths)}")  # relative_to raises ValueError for a path outside base_dir
+    sys.exit(1)
diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash
@@ -3,7 +3,56 @@
 set -e
 set -o pipefail
 
-CONFIG_FILE="${1:-test_config.json}"
+usage() {  # Print the command-line help text to stdout.
+    cat <<'EOF'
+Usage: run_models_and_profile.bash [--test-config path] [--only-run tag1,tag2]
+
+Options:
+  --test-config PATH  Path to test config JSON. Default: test_config.json.
+  --only-run TAGS   Only run the specified tag groups, separated by commas.
+  -h, --help        Show this help message.
+EOF
+}
+
+CONFIG_FILE="test_config.json"  # default, replaced by --test-config
+ONLY_RUN_TAGS=""  # raw comma-separated list from --only-run; stays empty unless the option is given
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --test-config)  # "--test-config PATH" form: the value is the next argument
+            [[ $# -lt 2 ]] && { echo "Error: --test-config requires a file path."; exit 1; }
+            CONFIG_FILE="$2"
+            shift 2
+            ;;
+        --test-config=*)  # "--test-config=PATH" form
+            CONFIG_FILE="${1#*=}"  # keep what follows the first "="
+            shift
+            ;;
+        --only-run)  # "--only-run TAGS" form: the value is the next argument
+            [[ $# -lt 2 ]] && { echo "Error: --only-run requires a comma-separated tag list."; exit 1; }
+            ONLY_RUN_TAGS="$2"
+            shift 2
+            ;;
+        --only-run=*)  # "--only-run=TAGS" form
+            ONLY_RUN_TAGS="${1#*=}"  # keep what follows the first "="
+            shift
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -*)  # any other dash-prefixed argument
+            echo "Error: Unknown option: $1"
+            usage
+            exit 1
+            ;;
+        *)  # positional arguments are rejected
+            echo "Error: Unknown positional argument: $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
 
 # Dependencies check
 if ! command -v jq >/dev/null 2>&1; then
@@ -33,6 +82,28 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")
 
 # Global variable to save the last cmake command
 LAST_CMAKE_CMD=""
+declare -A SELECTED_TAGS=()  # associative array used as a set: key = requested tag, value = 1
+
+normalize_tag() {  # Print $1 with leading and trailing whitespace removed (no trailing newline).
+    local raw="$1"
+    raw="${raw#"${raw%%[![:space:]]*}"}"  # drop the leading run of whitespace
+    raw="${raw%"${raw##*[![:space:]]}"}"  # drop the trailing run of whitespace
+    printf '%s' "$raw"
+}
+
+if [[ -n "$ONLY_RUN_TAGS" ]]; then  # only build the tag set when --only-run supplied a non-empty value
+    IFS=',' read -r -a requested_tags <<< "$ONLY_RUN_TAGS"  # split the list on commas
+    for raw_tag in "${requested_tags[@]}"; do
+        tag="$(normalize_tag "$raw_tag")"
+        [[ -z "$tag" ]] && continue  # ignore entries that are empty after trimming
+        SELECTED_TAGS["$tag"]=1
+    done
+
+    if [[ ${#SELECTED_TAGS[@]} -eq 0 ]]; then  # every entry was empty after trimming
+        echo "Error: --only-run did not contain any valid tags."
+        exit 1
+    fi
+fi
 
 # Clean the build directory
 clean_build_dir() {
@@ -46,9 +117,12 @@ run_and_log() {
     local cmd="$1"
     local log_name="$2"
     local is_profile="$3"
+    local tag="${4:-basic}"
     local timestamp
     timestamp=$(date '+%Y-%m-%d %H:%M:%S')
-    local log_path="$(realpath "${LOG_DIR}/${log_name}.log")"
+    local tag_log_dir="${LOG_DIR}/${tag}"
+    mkdir -p "$tag_log_dir"
+    local log_path="$(realpath "${tag_log_dir}/${log_name}.log")"
 
     echo -e "\033[1;32m============================================================\033[0m"
     echo -e "\033[1;36m[$timestamp] [Running] ${log_name}\033[0m"
@@ -99,22 +173,25 @@ run_and_log() {
 
     # If profiling is enabled, move profiling files to the target directory
     if [[ "$is_profile" == "yes" ]]; then
-        move_profile_logs "$log_name"
+        move_profile_logs "$log_name" "$tag"
     fi
 }
 
 
 # Move profiling output logs
 move_profile_logs() {
     local prefix="$1"
+    local tag="${2:-basic}"
+    local tag_profile_dir="${PROFILE_LOG_DIR}/${tag}"
+    mkdir -p "$tag_profile_dir"
 
     # Move *.report.rankN files
     for report_file in "${BUILD_DIR}"/*.report.rank*; do
         if [[ -f "$report_file" ]]; then
             local base_name
             base_name=$(basename "$report_file")
-            mv "$report_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
-            echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
+            mv "$report_file" "${tag_profile_dir}/${prefix}_${base_name}"
+            echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
         fi
     done
 
@@ -123,25 +200,39 @@ move_profile_logs() {
         if [[ -f "$record_file" ]]; then
             local base_name
             base_name=$(basename "$record_file")
-            mv "$record_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
-            echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
+            mv "$record_file" "${tag_profile_dir}/${prefix}_${base_name}"
+            echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
         fi
     done
 }
 
-# Build "--key value" arg string from tests[i].args (shell-escaped)
+# Build "--key value" arg string from test_groups[gi].tests[ti].args (values inserted via tostring; no @sh quoting is applied)
 args_string_for_test() {
-    local idx="$1"
-    jq -r --argjson i "$idx" '
-      .tests[$i].args
+    local group_idx="$1"  # zero-based index into .test_groups
+    local test_idx="$2"  # zero-based index into that group's .tests
+    jq -r --argjson g "$group_idx" --argjson t "$test_idx" '
+      .test_groups[$g].tests[$t].args
       | to_entries[]
       | "--\(.key) \(.value|tostring)"
     ' "$CONFIG_FILE" | paste -sd' ' -
 }
 
 # Run tests
 num_builds=$(jq '.builds | length' "$CONFIG_FILE")
-num_tests=$(jq '.tests  | length' "$CONFIG_FILE")
+num_groups=$(jq '.test_groups | length' "$CONFIG_FILE")
+
+selected_group_count=0  # number of groups that pass the tag filter, checked before any build starts
+for ((gi=0; gi<num_groups; ++gi)); do
+    group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
+    if [[ ${#SELECTED_TAGS[@]} -eq 0 || -n "${SELECTED_TAGS[$group_tag]}" ]]; then  # no filter, or this tag was requested
+        ((selected_group_count += 1))  # evaluates to a non-zero value, so its status is 0 and `set -e` is not triggered
+    fi
+done
+
+if [[ "$selected_group_count" -eq 0 ]]; then  # also true when the config contains no test groups at all
+    echo "Error: No matching test groups found for --only-run=${ONLY_RUN_TAGS}"
+    exit 1
+fi
 
 for ((id=0; id<num_builds; ++id)); do
     build_id=$(jq -r ".builds[$id].id" "$CONFIG_FILE")
@@ -152,7 +243,7 @@ for ((id=0; id<num_builds; ++id)); do
 
     # always clean before another build
     clean_build_dir
-    run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no"
+    run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
 
     # profile flag for runs
     profile_flag="no"
@@ -162,17 +253,27 @@ for ((id=0; id<num_builds; ++id)); do
         log_suffix="_profile"
     fi
 
-    for ((ti=0; ti<num_tests; ++ti)); do
-        test_id=$(jq -r ".tests[$ti].id" "$CONFIG_FILE")
-        arg_str="$(args_string_for_test "$ti")"
+    for ((gi=0; gi<num_groups; ++gi)); do
+        group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
+        if [[ ${#SELECTED_TAGS[@]} -gt 0 && -z "${SELECTED_TAGS[$group_tag]}" ]]; then  # a filter is active and this tag was not requested
+            continue
+        fi
+
+        num_tests=$(jq ".test_groups[$gi].tests | length" "$CONFIG_FILE")
+        echo -e "\033[1;36m[TEST GROUP] tag=${group_tag}, cases=${num_tests}\033[0m"
 
-        # gpt2
-        gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
-        run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag"
+        for ((ti=0; ti<num_tests; ++ti)); do
+            test_id=$(jq -r ".test_groups[$gi].tests[$ti].id" "$CONFIG_FILE")
+            arg_str="$(args_string_for_test "$gi" "$ti")"
 
-        # llama3
-        llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
-        run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag"
+            # gpt2
+            gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
+            run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"  # 4th argument is the tag; run_and_log writes the log under ${LOG_DIR}/<tag>/
+
+            # llama3
+            llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
+            run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"  # same tag directory as the gpt2 run of this group
+        done
     done
 done
 
@@ -202,3 +303,6 @@ else
     echo -e "\033[1;33m         or export COMPARE_LOG_DIR=/path/to/baseline_logs before running.\033[0m"
     echo -e "\033[1;33m============================================================\033[0m"
 fi
+
+echo -e "\n\033[1;36m[END OF TEST] Cleaning build directory after all tests\033[0m"
+clean_build_dir  # same helper that is called before each build