Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions scripts/compare_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
from pathlib import Path
from argparse import ArgumentParser
from compare_utils import collect_log_files, exit_if_duplicate_logs

def get_dtype_from_filename(filename):
"""Determine dtype from filename. Returns 'bfloat16' or 'fp32'."""
Expand Down Expand Up @@ -62,8 +63,10 @@ def main():
args.threshold_fp32 = args.threshold
args.threshold_bf16 = args.threshold

files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
files1, duplicates1 = collect_log_files(args.dir1)
files2, duplicates2 = collect_log_files(args.dir2)
exit_if_duplicate_logs(args.dir1, duplicates1)
exit_if_duplicate_logs(args.dir2, duplicates2)

only_in_1 = set(files1.keys()) - set(files2.keys())
only_in_2 = set(files2.keys()) - set(files1.keys())
Expand Down
7 changes: 5 additions & 2 deletions scripts/compare_tps.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
from pathlib import Path
from argparse import ArgumentParser
from compare_utils import collect_log_files, exit_if_duplicate_logs

def parse_log(file_path):
"""Extract step -> tok/s mapping from log file."""
Expand Down Expand Up @@ -55,8 +56,10 @@ def main():
parser.add_argument('--verbose', action='store_true', help='Print detailed output for all files, including passed ones')
args = parser.parse_args()

files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
files1, duplicates1 = collect_log_files(args.dir1)
files2, duplicates2 = collect_log_files(args.dir2)
exit_if_duplicate_logs(args.dir1, duplicates1)
exit_if_duplicate_logs(args.dir2, duplicates2)

only_in_1 = set(files1.keys()) - set(files2.keys())
only_in_2 = set(files2.keys()) - set(files1.keys())
Expand Down
31 changes: 31 additions & 0 deletions scripts/compare_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pathlib import Path
import sys


def collect_log_files(base_dir: Path):
    """Collect comparable training logs under *base_dir*, keyed by basename.

    The directory is searched recursively, so logs stored in per-tag
    subdirectories are picked up as well. The tag (subdirectory) is ignored:
    entries are indexed by file name only.

    Files whose name starts with ``build`` or ends with ``_profile.log`` are
    skipped, as are directories that happen to match ``*.log``.

    Args:
        base_dir: Root directory to search.

    Returns:
        A ``(files, duplicates)`` tuple. ``files`` maps each basename to the
        first path found with that name. ``duplicates`` maps every basename
        seen more than once to the list of all paths sharing it (the first
        occurrence included); it is empty when all basenames are unique.
    """
    files = {}
    duplicates = {}

    # Sort so that "first path found" and the order of reported duplicates
    # do not depend on filesystem enumeration order.
    for path in sorted(base_dir.rglob("*.log")):
        # rglob matches on the name only, so a directory called "x.log"
        # would otherwise be treated as a log file.
        if not path.is_file():
            continue
        if path.name.startswith("build") or path.name.endswith("_profile.log"):
            continue

        key = path.name
        if key in files:
            duplicates.setdefault(key, [files[key]]).append(path)
            continue
        files[key] = path

    return files, duplicates


def exit_if_duplicate_logs(base_dir: Path, duplicates):
    """Abort when duplicate basenames make comparison ambiguous.

    Logs are indexed by basename only, so test cases stored under different
    tag directories must not share a file name. When they do, there is no
    safe way to decide which file to compare, so the process exits.

    Args:
        base_dir: Directory the logs were collected from. Used in the
            message header and to shorten the reported paths.
        duplicates: Mapping of basename to the list of paths sharing that
            basename. An empty mapping means there is nothing to report.

    Returns:
        None when ``duplicates`` is empty.

    Raises:
        SystemExit: With status 1 after printing the report, when
            ``duplicates`` is not empty.
    """
    if not duplicates:
        return

    def _display(path):
        # Prefer a path relative to base_dir for readability. relative_to
        # raises ValueError for paths outside base_dir; in that case show
        # the path as given rather than crash while reporting the error.
        try:
            return str(Path(path).relative_to(base_dir))
        except ValueError:
            return str(path)

    print(f"Found duplicate log basenames in {base_dir.resolve()}, cannot compare safely:")
    for name, paths in sorted(duplicates.items()):
        print(f"  {name}: {', '.join(_display(p) for p in paths)}")
    sys.exit(1)
148 changes: 126 additions & 22 deletions scripts/run_models_and_profile.bash
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,56 @@
set -e
set -o pipefail

CONFIG_FILE="${1:-test_config.json}"
# Print the command-line help text to stdout.
# Two options are supported: --test-config and --only-run.
usage() {
    cat <<'EOF'
Usage: run_models_and_profile.bash [--test-config path] [--only-run tag1,tag2]

Options:
--test-config PATH Path to test config JSON. Default: test_config.json.
--only-run TAGS Only run the specified tag groups, separated by commas.
-h, --help Show this help message.
EOF
}

# Defaults, overridden by the command-line options handled below.
CONFIG_FILE="test_config.json"
ONLY_RUN_TAGS=""

# Walk the positional parameters left to right. Every branch either consumes
# what it recognised (shift) or terminates the script.
while (( $# > 0 )); do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        --test-config=*)
            # Inline form: keep everything after the first '='.
            CONFIG_FILE="${1#*=}"
            shift
            ;;
        --only-run=*)
            ONLY_RUN_TAGS="${1#*=}"
            shift
            ;;
        --test-config)
            # Separate-value form needs a second word to consume.
            if (( $# < 2 )); then
                echo "Error: --test-config requires a file path."
                exit 1
            fi
            CONFIG_FILE="$2"
            shift 2
            ;;
        --only-run)
            if (( $# < 2 )); then
                echo "Error: --only-run requires a comma-separated tag list."
                exit 1
            fi
            ONLY_RUN_TAGS="$2"
            shift 2
            ;;
        -*)
            echo "Error: Unknown option: $1"
            usage
            exit 1
            ;;
        *)
            echo "Error: Unknown positional argument: $1"
            usage
            exit 1
            ;;
    esac
done

# Dependencies check
if ! command -v jq >/dev/null 2>&1; then
Expand Down Expand Up @@ -33,6 +82,28 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")

# Global variable to save the last cmake command
LAST_CMAKE_CMD=""
# Associative array used as a set: each key is a tag requested via --only-run.
# It stays empty when --only-run is not given.
declare -A SELECTED_TAGS=()

# Strip leading and trailing whitespace from a tag and print the result
# without a trailing newline.
#   $1 - raw tag text
normalize_tag() {
    local text="$1"
    local lead trail

    # Everything before the first non-space character.
    lead="${text%%[![:space:]]*}"
    text="${text#"$lead"}"

    # Everything after the last non-space character.
    trail="${text##*[![:space:]]}"
    text="${text%"$trail"}"

    printf '%s' "$text"
}

# Fill SELECTED_TAGS from the comma-separated --only-run value.
# Entries that are empty after trimming are ignored; if nothing usable
# remains, the script stops with an error.
if [[ -n "$ONLY_RUN_TAGS" ]]; then
    IFS=',' read -r -a requested_tags <<< "$ONLY_RUN_TAGS"
    for raw_tag in "${requested_tags[@]}"; do
        tag="$(normalize_tag "$raw_tag")"
        if [[ -n "$tag" ]]; then
            SELECTED_TAGS["$tag"]=1
        fi
    done

    if (( ${#SELECTED_TAGS[@]} == 0 )); then
        echo "Error: --only-run did not contain any valid tags."
        exit 1
    fi
fi

# Clean the build directory
clean_build_dir() {
Expand All @@ -46,9 +117,12 @@ run_and_log() {
local cmd="$1"
local log_name="$2"
local is_profile="$3"
local tag="${4:-basic}"
local timestamp
timestamp=$(date '+%Y-%m-%d %H:%M:%S')
local log_path="$(realpath "${LOG_DIR}/${log_name}.log")"
local tag_log_dir="${LOG_DIR}/${tag}"
mkdir -p "$tag_log_dir"
local log_path="$(realpath "${tag_log_dir}/${log_name}.log")"

echo -e "\033[1;32m============================================================\033[0m"
echo -e "\033[1;36m[$timestamp] [Running] ${log_name}\033[0m"
Expand Down Expand Up @@ -99,22 +173,25 @@ run_and_log() {

# If profiling is enabled, move profiling files to the target directory
if [[ "$is_profile" == "yes" ]]; then
move_profile_logs "$log_name"
move_profile_logs "$log_name" "$tag"
fi
}


# Move profiling output logs
move_profile_logs() {
local prefix="$1"
local tag="${2:-basic}"
local tag_profile_dir="${PROFILE_LOG_DIR}/${tag}"
mkdir -p "$tag_profile_dir"

# Move *.report.rankN files
for report_file in "${BUILD_DIR}"/*.report.rank*; do
if [[ -f "$report_file" ]]; then
local base_name
base_name=$(basename "$report_file")
mv "$report_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
mv "$report_file" "${tag_profile_dir}/${prefix}_${base_name}"
echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
fi
done

Expand All @@ -123,25 +200,39 @@ move_profile_logs() {
if [[ -f "$record_file" ]]; then
local base_name
base_name=$(basename "$record_file")
mv "$record_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
mv "$record_file" "${tag_profile_dir}/${prefix}_${base_name}"
echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
fi
done
}

# Build "--key value" arg string from tests[i].args (shell-escaped)
# Build "--key value" arg string from test_groups[gi].tests[ti].args (shell-escaped)
args_string_for_test() {
# Prints a single line of space-separated "--key value" pairs, one pair per
# entry of the selected test's "args" object in $CONFIG_FILE.
# NOTE(review): values are emitted through jq's `tostring` only; no @sh
# quoting is applied here, so a value containing spaces or shell
# metacharacters is not escaped by this function - confirm how callers
# execute the resulting string.
local idx="$1"
jq -r --argjson i "$idx" '
.tests[$i].args
local group_idx="$1"
local test_idx="$2"
jq -r --argjson g "$group_idx" --argjson t "$test_idx" '
.test_groups[$g].tests[$t].args
| to_entries[]
| "--\(.key) \(.value|tostring)"
' "$CONFIG_FILE" | paste -sd' ' -
}

# Run tests
num_builds=$(jq '.builds | length' "$CONFIG_FILE")
num_tests=$(jq '.tests | length' "$CONFIG_FILE")
num_groups=$(jq '.test_groups | length' "$CONFIG_FILE")

# jq reports length 0 both for an empty array and for null (key missing).
# Report that case on its own: the --only-run message further down would be
# misleading when the config simply defines no groups.
if (( num_groups == 0 )); then
    echo "Error: No test groups defined in ${CONFIG_FILE} (.test_groups is missing or empty)."
    exit 1
fi

# Count the groups that will actually run. An empty SELECTED_TAGS means no
# filter was requested, so every group counts.
selected_group_count=0
for ((gi=0; gi<num_groups; ++gi)); do
    group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
    if [[ ${#SELECTED_TAGS[@]} -eq 0 || -n "${SELECTED_TAGS[$group_tag]}" ]]; then
        ((selected_group_count += 1))
    fi
done

# Fail early, before any build is started, when the filter matches nothing.
if [[ "$selected_group_count" -eq 0 ]]; then
    echo "Error: No matching test groups found for --only-run=${ONLY_RUN_TAGS}"
    exit 1
fi

for ((id=0; id<num_builds; ++id)); do
build_id=$(jq -r ".builds[$id].id" "$CONFIG_FILE")
Expand All @@ -152,7 +243,7 @@ for ((id=0; id<num_builds; ++id)); do

# always clean before another build
clean_build_dir
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no"
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"

# profile flag for runs
profile_flag="no"
Expand All @@ -162,17 +253,27 @@ for ((id=0; id<num_builds; ++id)); do
log_suffix="_profile"
fi

for ((ti=0; ti<num_tests; ++ti)); do
test_id=$(jq -r ".tests[$ti].id" "$CONFIG_FILE")
arg_str="$(args_string_for_test "$ti")"
for ((gi=0; gi<num_groups; ++gi)); do
group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
if [[ ${#SELECTED_TAGS[@]} -gt 0 && -z "${SELECTED_TAGS[$group_tag]}" ]]; then
continue
fi

num_tests=$(jq ".test_groups[$gi].tests | length" "$CONFIG_FILE")
echo -e "\033[1;36m[TEST GROUP] tag=${group_tag}, cases=${num_tests}\033[0m"

# gpt2
gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag"
for ((ti=0; ti<num_tests; ++ti)); do
test_id=$(jq -r ".test_groups[$gi].tests[$ti].id" "$CONFIG_FILE")
arg_str="$(args_string_for_test "$gi" "$ti")"

# llama3
llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag"
# gpt2
gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"

# llama3
llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
done
done
done

Expand Down Expand Up @@ -202,3 +303,6 @@ else
echo -e "\033[1;33m or export COMPARE_LOG_DIR=/path/to/baseline_logs before running.\033[0m"
echo -e "\033[1;33m============================================================\033[0m"
fi

echo -e "\n\033[1;36m[END OF TEST] Cleaning build directory after all tests\033[0m"
clean_build_dir
Loading