diff --git a/build.sh b/build.sh index 492c033686..d1106a5bd4 100755 --- a/build.sh +++ b/build.sh @@ -263,8 +263,8 @@ if [ -z "$MODE" ]; then -Xcompiler=-DPLATFORM_DESKTOP \ -std=c++17 \ -I. -Isrc \ - -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE -I$NUMPY_INCLUDE \ - -I$CUDA_HOME/include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME/include \ + -I"$PYTHON_INCLUDE" -I"$PYBIND_INCLUDE" -I"$NUMPY_INCLUDE" \ + -I"$CUDA_HOME/include" $CUDNN_IFLAG $NCCL_IFLAG -I"$RAYLIB_NAME/include" \ -Xcompiler=-fopenmp \ -DOBS_TENSOR_T=$OBS_TENSOR_T \ -DENV_NAME=$ENV \ @@ -291,7 +291,7 @@ elif [ "$MODE" = "cpu" ]; then -DPLATFORM_DESKTOP \ -std=c++17 \ -I. -Isrc \ - -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE \ + -I"$PYTHON_INCLUDE" -I"$PYBIND_INCLUDE" \ -DOBS_TENSOR_T=$OBS_TENSOR_T \ -DENV_NAME=$ENV \ $PRECISION $LINK_OPT \ @@ -310,7 +310,7 @@ elif [ "$MODE" = "profile" ]; then echo "Compiling profile binary ($ARCH)..." $NVCC $NVCC_OPT -arch=$ARCH -std=c++17 \ -I. -Isrc -I$SRC_DIR -Ivendor \ - -I$CUDA_HOME/include $CUDNN_IFLAG $NCCL_IFLAG -I$RAYLIB_NAME/include \ + -I"$CUDA_HOME/include" $CUDNN_IFLAG $NCCL_IFLAG -I"$RAYLIB_NAME/include" \ -DOBS_TENSOR_T=$OBS_TENSOR_T \ -DENV_NAME=$ENV \ -Xcompiler=-DPLATFORM_DESKTOP \ diff --git a/config/clifford.ini b/config/clifford.ini new file mode 100644 index 0000000000..b4a3d6ddb9 --- /dev/null +++ b/config/clifford.ini @@ -0,0 +1,21 @@ +[base] +env_name = clifford + +[env] +# Must match the compile-time CLIFFORD_N_QUBITS used for build.sh. 
+n_qubits = 6 +difficulty = 10.0 +max_steps = 200 +single_qubit_cost = 0.001 +cz_cost = 0.1 +goal_bonus = 0.0 +failure_penalty = -1.0 +use_shortcut_gates = 1 +seed = 0 + +[policy] +hidden_size = 128 +num_layers = 2 + +[train] +total_timesteps = 10_000_000 diff --git a/examples/clifford_synthesize.py b/examples/clifford_synthesize.py new file mode 100755 index 0000000000..543741c429 --- /dev/null +++ b/examples/clifford_synthesize.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +import argparse +import glob +import json +import os +import sys + +import numpy as np +import torch + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from pufferlib import _C +from pufferlib.torch_pufferl import load_policy + +Action = tuple[str, int, int] +BASE_SINGLE_QUBIT_GATES = ("h", "s") +SHORTCUT_SINGLE_QUBIT_GATES = ("v", "hs", "hv") + + +def build_actions(n_qubits: int, use_shortcut_gates: bool = True) -> list[Action]: + actions: list[Action] = [] + single_qubit_gates = BASE_SINGLE_QUBIT_GATES + if use_shortcut_gates: + single_qubit_gates = single_qubit_gates + SHORTCUT_SINGLE_QUBIT_GATES + for gate in single_qubit_gates: + for qubit in range(n_qubits): + actions.append((gate, qubit, -1)) + + for src in range(n_qubits): + for dst in range(src + 1, n_qubits): + actions.append(("cz", src, dst)) + + return actions + + +def identity_symplectic(n_qubits): + return np.eye(2 * n_qubits, dtype=np.uint8) + + +def symplectic_form(n_qubits): + omega = np.zeros((2 * n_qubits, 2 * n_qubits), dtype=np.uint8) + eye = np.eye(n_qubits, dtype=np.uint8) + omega[:n_qubits, n_qubits:] = eye + omega[n_qubits:, :n_qubits] = eye + return omega + + +def is_symplectic(matrix): + matrix_u8 = np.asarray(matrix, dtype=np.uint8) + if matrix_u8.ndim != 2 or matrix_u8.shape[0] != matrix_u8.shape[1]: + return False + if matrix_u8.shape[0] % 2 != 0: + return False + n_qubits = matrix_u8.shape[0] // 2 + omega = 
symplectic_form(n_qubits) + lhs = (matrix_u8.T @ omega @ matrix_u8) % 2 + return bool(np.array_equal(lhs.astype(np.uint8), omega)) + + +def xor_columns_inplace(matrix, dst_idx, src_col): + np.bitwise_xor(matrix[:, dst_idx], src_col, out=matrix[:, dst_idx]) + + +def apply_action_inplace(matrix, action): + gate, q0, q1 = action + n_qubits = matrix.shape[0] // 2 + if gate == "h": + z_col = n_qubits + q0 + matrix[:, [q0, z_col]] = matrix[:, [z_col, q0]] + elif gate == "s": + xor_columns_inplace(matrix, n_qubits + q0, matrix[:, q0].copy()) + elif gate == "v": + apply_action_inplace(matrix, ("s", q0, -1)) + apply_action_inplace(matrix, ("h", q0, -1)) + apply_action_inplace(matrix, ("s", q0, -1)) + elif gate == "hs": + apply_action_inplace(matrix, ("h", q0, -1)) + apply_action_inplace(matrix, ("s", q0, -1)) + elif gate == "hv": + apply_action_inplace(matrix, ("h", q0, -1)) + apply_action_inplace(matrix, ("v", q0, -1)) + elif gate == "cz": + src_x = matrix[:, q0].copy() + dst_x = matrix[:, q1].copy() + xor_columns_inplace(matrix, n_qubits + q0, dst_x) + xor_columns_inplace(matrix, n_qubits + q1, src_x) + else: + raise ValueError(f"unknown gate {gate}") + + +def latest_checkpoint(checkpoint_dir): + pattern = os.path.join(checkpoint_dir, "clifford", "**", "*.bin") + candidates = glob.glob(pattern, recursive=True) + if not candidates: + raise FileNotFoundError(f"No checkpoints found at {pattern}") + return max(candidates, key=os.path.getctime) + + +def n_qubits_from_matrix(matrix): + if ( + matrix.ndim != 2 + or matrix.shape[0] != matrix.shape[1] + or matrix.shape[0] % 2 != 0 + ): + raise ValueError(f"expected an even square tableau, got {matrix.shape}") + return matrix.shape[0] // 2 + + +def load_matrix(path, n_qubits=None): + if path.endswith(".npy"): + matrix = np.load(path) + else: + with open(path) as f: + matrix = np.asarray(json.load(f), dtype=np.uint8) + matrix = np.ascontiguousarray(matrix, dtype=np.uint8) + matrix_n_qubits = n_qubits_from_matrix(matrix) + if 
n_qubits is not None and matrix_n_qubits != n_qubits: + expected_shape = (2 * n_qubits, 2 * n_qubits) + raise ValueError(f"expected a {expected_shape} tableau, got {matrix.shape}") + if not is_symplectic(matrix): + raise ValueError("matrix is not symplectic") + return matrix + + +def random_tableau(n_qubits, seed, random_steps, use_shortcut_gates=True): + if random_steps <= 0: + return identity_symplectic(n_qubits) + + actions = build_actions(n_qubits, use_shortcut_gates=use_shortcut_gates) + rng = np.random.default_rng(seed) + while True: + matrix = identity_symplectic(n_qubits) + for _ in range(random_steps): + apply_action_inplace(matrix, actions[int(rng.integers(len(actions)))]) + if not np.array_equal(matrix, identity_symplectic(n_qubits)): + return matrix + + +def default_checkpoint_dir(n_qubits, hidden_size, use_shortcut_gates=True): + action_suffix = "" if use_shortcut_gates else "_hs_cz" + return f"checkpoints/clifford_{n_qubits}q{action_suffix}_mlp{hidden_size}_long" + + +def make_policy_args(args, checkpoint, n_qubits): + return { + "env_name": "clifford", + "checkpoint_dir": args.checkpoint_dir, + "load_model_path": checkpoint, + "load_id": None, + "wandb": False, + "vec": { + "total_agents": 1, + "num_buffers": 1, + "num_threads": 1, + }, + "env": { + "n_qubits": n_qubits, + "difficulty": 0, + "max_steps": args.max_steps + 1, + "single_qubit_cost": 0.001, + "cz_cost": 0.1, + "goal_bonus": 0.0, + "failure_penalty": -1.0, + "use_shortcut_gates": int(args.use_shortcut_gates), + "seed": args.seed, + }, + "policy": { + "hidden_size": args.hidden_size, + "num_layers": args.num_layers, + "expansion_factor": 1, + }, + "torch": { + "network": args.network, + "encoder": "DefaultEncoder", + "decoder": "DefaultDecoder", + }, + } + + +def synthesize(policy, matrix, max_steps, use_shortcut_gates=True): + matrix = matrix.copy() + n_qubits = matrix.shape[0] // 2 + actions = build_actions(n_qubits, use_shortcut_gates=use_shortcut_gates) + identity = 
identity_symplectic(n_qubits)
+    device = next(policy.parameters()).device
+    state = policy.initial_state(1, device)
+    sequence = []
+
+    policy.eval()
+    with torch.no_grad():
+        for step in range(max_steps + 1):
+            if np.array_equal(matrix, identity):
+                return sequence, True
+            obs_t = torch.as_tensor(matrix.reshape(1, -1), device=device)
+            logits, _value, state = policy.forward_eval(obs_t, state)
+            action_idx = int(torch.argmax(logits, dim=-1).item())
+            sequence.append(actions[action_idx])
+            apply_action_inplace(matrix, actions[action_idx])
+            if np.array_equal(matrix, identity):
+                return sequence, True
+
+    return sequence, False
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Synthesize Clifford tableaus with a trained Puffer policy"
+    )
+    parser.add_argument(
+        "--checkpoint", default="latest", help="Checkpoint path, or 'latest'"
+    )
+    parser.add_argument(
+        "--checkpoint-dir",
+        help="Checkpoint directory. Defaults to checkpoints/clifford_<n>q_mlp<hidden>_long",
+    )
+    parser.add_argument(
+        "--matrix", help="Path to a .npy or JSON tableau. Omit for random."
+    )
+    parser.add_argument(
+        "--n-qubits",
+        type=int,
+        help="Number of qubits. Inferred from --matrix, otherwise defaults to 3.",
+    )
+    parser.add_argument("--random-steps", type=int, default=12)
+    parser.add_argument("--seed", type=int, default=1)
+    parser.add_argument("--max-steps", type=int, default=64)
+    parser.add_argument("--hidden-size", type=int, default=128)
+    parser.add_argument("--num-layers", type=int, default=2)
+    parser.add_argument("--network", default="MLP")
+    parser.add_argument("--use-shortcut-gates", action="store_true", default=True)
+    parser.add_argument(
+        "--no-shortcut-gates", action="store_false", dest="use_shortcut_gates"
+    )
+    args = parser.parse_args()
+
+    if args.n_qubits is not None and args.n_qubits <= 0:
+        raise ValueError("--n-qubits must be positive")
+
+    if args.matrix:
+        matrix = load_matrix(args.matrix, n_qubits=args.n_qubits)
+        n_qubits = n_qubits_from_matrix(matrix)
+    else:
+        n_qubits = args.n_qubits or 3
+        matrix = random_tableau(
+            n_qubits,
+            args.seed,
+            args.random_steps,
+            use_shortcut_gates=args.use_shortcut_gates,
+        )
+
+    if args.checkpoint_dir is None:
+        args.checkpoint_dir = default_checkpoint_dir(
+            n_qubits,
+            args.hidden_size,
+            use_shortcut_gates=args.use_shortcut_gates,
+        )
+
+    if getattr(_C, "env_name", None) != "clifford":
+        raise RuntimeError(
+            "Build Clifford first, e.g. "
+            f"EXTRA_CFLAGS='-DCLIFFORD_N_QUBITS={n_qubits} "
+            f"-DCLIFFORD_USE_SHORTCUT_GATES={int(args.use_shortcut_gates)}' "
+            "bash build.sh clifford --cpu"
+        )
+
+    checkpoint = (
+        latest_checkpoint(args.checkpoint_dir)
+        if args.checkpoint == "latest"
+        else args.checkpoint
+    )
+    policy_args = make_policy_args(args, checkpoint, n_qubits)
+    vec = _C.create_vec(policy_args, 0)
+    try:
+        expected_obs_size = (2 * n_qubits) ** 2
+        expected_actions = len(
+            build_actions(n_qubits, use_shortcut_gates=args.use_shortcut_gates)
+        )
+        if vec.obs_size != expected_obs_size or vec.act_sizes != [expected_actions]:
+            raise RuntimeError(
+                f"This synthesizer needs a {n_qubits}-qubit Clifford build "
+                f"(obs_size={expected_obs_size}, act_sizes={[expected_actions]}); "
+                f"got obs_size={vec.obs_size}, act_sizes={vec.act_sizes}"
+            )
+        policy = load_policy(policy_args, vec)
+    finally:
+        vec.close()
+
+    sequence, solved = synthesize(
+        policy,
+        matrix,
+        args.max_steps,
+        use_shortcut_gates=args.use_shortcut_gates,
+    )
+    for idx, (gate, q0, q1) in enumerate(sequence, 1):
+        if q1 < 0:
+            print(f"{idx:02d}: {gate} {q0}")
+        else:
+            print(f"{idx:02d}: {gate} {q0} {q1}")
+
+    print(f"solved={solved} steps={len(sequence)} checkpoint={checkpoint}")
+    if not solved:
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ocean/clifford/binding.c b/ocean/clifford/binding.c
new file mode 100644
index 0000000000..99559a9ef9
--- /dev/null
+++ b/ocean/clifford/binding.c
@@ -0,0 +1,131 @@
+#include "clifford.h"
+
+#define OBS_SIZE CLIFFORD_OBS_SIZE
+#define NUM_ATNS 1
+#define ACT_SIZES {CLIFFORD_NUM_ACTIONS}
+#define OBS_TENSOR_T ByteTensor
+
+#define Env CliffordEnv
+#define MY_VEC_INIT
+#define MY_VEC_CLOSE
+#include "vecenv.h"
+
+static double dict_get_or(Dict* dict, const char* key, double fallback) {
+    DictItem* item = dict_get_unsafe(dict, key);
+    return item == NULL ? 
fallback : item->value; +} + +Env* my_vec_init( + int* num_envs_out, + int* buffer_env_starts, + int* buffer_env_counts, + Dict* vec_kwargs, + Dict* env_kwargs) { + int total_agents = (int)dict_get(vec_kwargs, "total_agents")->value; + int num_buffers = (int)dict_get(vec_kwargs, "num_buffers")->value; + assert(total_agents > 0); + assert(num_buffers > 0); + assert(total_agents % num_buffers == 0); + + int requested_n_qubits = (int)dict_get_or(env_kwargs, "n_qubits", CLIFFORD_N_QUBITS); + if (requested_n_qubits != CLIFFORD_N_QUBITS) { + fprintf(stderr, + "clifford is compiled for n_qubits=%d, got n_qubits=%d\n", + CLIFFORD_N_QUBITS, requested_n_qubits); + } + assert(requested_n_qubits == CLIFFORD_N_QUBITS); + int requested_shortcuts = (int)dict_get_or(env_kwargs, "use_shortcut_gates", CLIFFORD_USE_SHORTCUT_GATES); + if (requested_shortcuts != CLIFFORD_USE_SHORTCUT_GATES) { + fprintf(stderr, + "Clifford env was compiled with CLIFFORD_USE_SHORTCUT_GATES=%d but got use_shortcut_gates=%d\n", + CLIFFORD_USE_SHORTCUT_GATES, requested_shortcuts); + } + assert(requested_shortcuts == CLIFFORD_USE_SHORTCUT_GATES); + + CliffordVecEnv* shared = (CliffordVecEnv*)calloc(1, sizeof(CliffordVecEnv)); + assert(shared != NULL); + shared->num_envs = total_agents; + shared->n_qubits = CLIFFORD_N_QUBITS; + shared->dim = CLIFFORD_DIM; + set_difficulty_level(shared, dict_get_or(env_kwargs, "difficulty", 10.0)); + shared->max_steps = (int)dict_get_or(env_kwargs, "max_steps", 200.0); + shared->single_qubit_cost = (float)dict_get_or(env_kwargs, "single_qubit_cost", 0.001); + shared->cz_cost = (float)dict_get_or(env_kwargs, "cz_cost", 0.1); + shared->goal_bonus = (float)dict_get_or(env_kwargs, "goal_bonus", 0.0); + shared->failure_penalty = (float)dict_get_or(env_kwargs, "failure_penalty", -1.0); + assert(shared->max_steps > 0); + assert(shared->single_qubit_cost >= 0.0f); + assert(shared->cz_cost >= 0.0f); + assert(shared->goal_bonus >= 0.0f); + assert(shared->failure_penalty <= 0.0f); + + 
build_clifford_actions(shared); + assert(shared->actions != NULL); + for (int col = 0; col < shared->dim; ++col) { + shared->identity_cols[col] = 1ULL << col; + } + + uint64_t seed = (uint64_t)(uint32_t)dict_get_or(env_kwargs, "seed", 0.0); + + Env* envs = (Env*)calloc((size_t)total_agents, sizeof(Env)); + assert(envs != NULL); + shared->envs = envs; + + int agents_per_buffer = total_agents / num_buffers; + for (int buf = 0; buf < num_buffers; ++buf) { + buffer_env_starts[buf] = buf * agents_per_buffer; + buffer_env_counts[buf] = agents_per_buffer; + } + + uint64_t seed_state = seed; + for (int env_idx = 0; env_idx < total_agents; ++env_idx) { + Env* env = &envs[env_idx]; + env->vec = shared; + env->num_agents = 1; + env->cols = (uint64_t*)calloc((size_t)shared->dim, sizeof(uint64_t)); + assert(env->cols != NULL); + rng_seed(&env->rng, splitmix64_next(&seed_state) ^ (uint64_t)(env_idx + 1)); + } + + *num_envs_out = total_agents; + return envs; +} + +void my_vec_close(Env* envs) { + if (envs == NULL) { + return; + } + CliffordVecEnv* shared = envs[0].vec; + if (shared == NULL) { + return; + } + for (int env_idx = 0; env_idx < shared->num_envs; ++env_idx) { + free(envs[env_idx].cols); + envs[env_idx].cols = NULL; + } + free(shared->actions); + free(shared); +} + +void my_log(Log* log, Dict* out) { + dict_set(out, "perf", log->perf); + dict_set(out, "score", log->score); + dict_set(out, "episode_return", log->episode_return); + dict_set(out, "episode_length", log->episode_length); + dict_set(out, "mean_cz", log->episode_cz_sum); + dict_set(out, "success_rate", log->success_rate); + dict_set(out, "difficulty", log->difficulty); + dict_set(out, "max_steps", log->max_steps); + + float success_rate = log->success_count; + float mean_success_steps = success_rate > 0.0f ? log->success_step_sum / success_rate : 0.0f; + float mean_success_cz = success_rate > 0.0f ? log->success_cz_sum / success_rate : 0.0f; + float success_step_second = success_rate > 0.0f ? 
log->success_step_sq_sum / success_rate : 0.0f;
+    float success_step_var = success_step_second - mean_success_steps * mean_success_steps;
+    if (success_step_var < 0.0f) {
+        success_step_var = 0.0f;
+    }
+    dict_set(out, "success_step_mean", mean_success_steps);
+    dict_set(out, "success_step_std", sqrtf(success_step_var));
+    dict_set(out, "mean_success_cz", mean_success_cz);
+}
diff --git a/ocean/clifford/clifford.c b/ocean/clifford/clifford.c
new file mode 100644
index 0000000000..f17a2bdce3
--- /dev/null
+++ b/ocean/clifford/clifford.c
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include "clifford.h"
+
+int main(void) {
+    CliffordVecEnv shared = {0};
+    shared.num_envs = 1;
+    shared.n_qubits = CLIFFORD_N_QUBITS;
+    shared.dim = CLIFFORD_DIM;
+    set_difficulty_level(&shared, 10.0);
+    shared.max_steps = 200;
+    shared.single_qubit_cost = 0.001f;
+    shared.cz_cost = 0.1f;
+    shared.goal_bonus = 0.0f;
+    shared.failure_penalty = -1.0f;
+    build_clifford_actions(&shared);
+    for (int col = 0; col < shared.dim; ++col) {
+        shared.identity_cols[col] = 1ULL << col;
+    }
+
+    unsigned char observations[CLIFFORD_OBS_SIZE] = {0};
+    float actions[1] = {0};
+    float rewards[1] = {0};
+    float terminals[1] = {0};
+    CliffordEnv env = {
+        .cols = (uint64_t*)calloc((size_t)shared.dim, sizeof(uint64_t)),
+        .vec = &shared,
+        .observations = observations,
+        .actions = actions,
+        .rewards = rewards,
+        .terminals = terminals,
+        .num_agents = 1,
+    };
+    rng_seed(&env.rng, 1);
+    c_reset(&env);
+    for (int step = 0; step < 1000; ++step) {
+        actions[0] = (float)sample_action(&shared, &env.rng);
+        c_step(&env);
+    }
+
+    printf("clifford smoke complete: reward=%f terminal=%f\n", rewards[0], terminals[0]);
+    free(env.cols);
+    free(shared.actions);
+    return 0;
+}
diff --git a/ocean/clifford/clifford.h b/ocean/clifford/clifford.h
new file mode 100644
index 0000000000..9f55d97f83
--- /dev/null
+++ b/ocean/clifford/clifford.h
@@ -0,0 +1,343 @@
+#ifndef PUFFERLIB_OCEAN_CLIFFORD_CLIFFORD_H
+#define PUFFERLIB_OCEAN_CLIFFORD_CLIFFORD_H
+
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef CLIFFORD_N_QUBITS
+#define CLIFFORD_N_QUBITS 6
+#endif
+#if CLIFFORD_N_QUBITS < 1 || CLIFFORD_N_QUBITS > 32
+#error "CLIFFORD_N_QUBITS must be in [1, 32]; tableau rows are stored in uint64_t"
+#endif
+#define CLIFFORD_DIM (2 * CLIFFORD_N_QUBITS)
+#define CLIFFORD_OBS_SIZE (CLIFFORD_DIM * CLIFFORD_DIM)
+#ifndef CLIFFORD_USE_SHORTCUT_GATES
+#define CLIFFORD_USE_SHORTCUT_GATES 1
+#endif
+#define CLIFFORD_SINGLE_QUBIT_ACTIONS (CLIFFORD_USE_SHORTCUT_GATES ? 5 : 2)
+#define CLIFFORD_NUM_ACTIONS (CLIFFORD_SINGLE_QUBIT_ACTIONS * CLIFFORD_N_QUBITS + (CLIFFORD_N_QUBITS * (CLIFFORD_N_QUBITS - 1)) / 2)
+
+#define GATE_H 0
+#define GATE_S 1
+#define GATE_V 2
+#define GATE_HS 3
+#define GATE_HV 4
+#define GATE_CZ 5
+
+typedef struct {
+    int gate_kind;
+    int q0;
+    int q1;
+} CliffordAction;
+
+typedef struct Log {
+    float perf;
+    float score;
+    float episode_return;
+    float episode_length;
+    float episode_cz_sum;
+    float success_rate;
+    float difficulty;
+    float max_steps;
+    float n;
+    float success_count;
+    float success_step_sum;
+    float success_step_sq_sum;
+    float success_cz_sum;
+} Log;
+
+typedef struct {
+    uint64_t state;
+} XorShift64;
+
+typedef struct CliffordVecEnv CliffordVecEnv;
+
+typedef struct {
+    Log log;
+    uint64_t* cols;
+    CliffordVecEnv* vec;
+    unsigned char* observations;
+    float* actions;
+    float* rewards;
+    float* terminals;
+    int num_agents;
+    float episode_return;
+    int episode_length;
+    int episode_cz_count;
+    int steps;
+    int episode_max_steps;
+    XorShift64 rng;
+} CliffordEnv;
+
+struct CliffordVecEnv {
+    CliffordEnv* envs;
+    int num_envs;
+    int n_qubits;
+    int dim;
+    int difficulty;
+    float difficulty_fraction;
+    int max_steps;
+    int num_actions;
+    CliffordAction* actions;
+    uint64_t identity_cols[64];
+    float single_qubit_cost;
+    float cz_cost;
+    float goal_bonus;
+    float failure_penalty;
+};
+
+static inline uint64_t splitmix64_next(uint64_t* state) {
+    uint64_t 
z = (*state += 0x9E3779B97F4A7C15ULL); + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL; + return z ^ (z >> 31); +} + +static inline void rng_seed(XorShift64* rng, uint64_t seed) { + if (seed == 0) { + seed = 0x123456789ABCDEFULL; + } + rng->state = seed; +} + +static inline uint64_t rng_next_u64(XorShift64* rng) { + uint64_t x = rng->state; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + rng->state = x; + return x * 2685821657736338717ULL; +} + +static inline int rng_below(XorShift64* rng, int upper) { + if (upper <= 1) { + return 0; + } + return (int)(rng_next_u64(rng) % (uint64_t)upper); +} + +static inline float rng_float01(XorShift64* rng) { + return (float)((rng_next_u64(rng) >> 40) * (1.0 / 16777216.0)); +} + +static inline void set_difficulty_level(CliffordVecEnv* vec, double difficulty_level) { + if (difficulty_level < 0.0) { + difficulty_level = 0.0; + } + double floor_level = floor(difficulty_level + 1e-12); + vec->difficulty = (int)floor_level; + vec->difficulty_fraction = (float)(difficulty_level - floor_level); + if (vec->difficulty_fraction <= 1e-6f) { + vec->difficulty_fraction = 0.0f; + } else if (vec->difficulty_fraction >= 1.0f - 1e-6f) { + vec->difficulty += 1; + vec->difficulty_fraction = 0.0f; + } +} + +static inline int effective_reset_difficulty(const CliffordVecEnv* vec) { + return vec->difficulty + (vec->difficulty_fraction > 0.0f ? 1 : 0); +} + +static inline int sample_reset_difficulty(const CliffordVecEnv* vec, XorShift64* rng) { + if (vec->difficulty_fraction <= 0.0f) { + return vec->difficulty; + } + return vec->difficulty + (rng_float01(rng) < vec->difficulty_fraction ? 
1 : 0); +} + +static inline void copy_identity_cols(const CliffordVecEnv* vec, CliffordEnv* env) { + memcpy(env->cols, vec->identity_cols, (size_t)vec->dim * sizeof(uint64_t)); +} + +static inline void copy_cols(const CliffordVecEnv* vec, uint64_t* dst, const uint64_t* src) { + memcpy(dst, src, (size_t)vec->dim * sizeof(uint64_t)); +} + +static inline void reset_episode_state(const CliffordVecEnv* vec, CliffordEnv* env) { + env->steps = 0; + env->episode_max_steps = vec->max_steps; + env->episode_return = 0.0f; + env->episode_length = 0; + env->episode_cz_count = 0; +} + +static inline int is_identity(const CliffordVecEnv* vec, const CliffordEnv* env) { + for (int col = 0; col < vec->dim; ++col) { + if (env->cols[col] != vec->identity_cols[col]) { + return 0; + } + } + return 1; +} + +static inline int sample_action(const CliffordVecEnv* vec, XorShift64* rng) { + if (vec->num_actions <= 0) { + return -1; + } + return rng_below(rng, vec->num_actions); +} + +static inline void build_clifford_actions(CliffordVecEnv* vec) { + vec->num_actions = CLIFFORD_NUM_ACTIONS; + vec->actions = (CliffordAction*)calloc((size_t)vec->num_actions, sizeof(CliffordAction)); + if (vec->actions == NULL) { + return; + } + + int idx = 0; + const int max_single_gate = CLIFFORD_USE_SHORTCUT_GATES ? 
GATE_HV : GATE_S; + for (int gate = GATE_H; gate <= max_single_gate; ++gate) { + for (int qubit = 0; qubit < vec->n_qubits; ++qubit) { + vec->actions[idx++] = (CliffordAction){.gate_kind = gate, .q0 = qubit, .q1 = -1}; + } + } + for (int src = 0; src < vec->n_qubits; ++src) { + for (int dst = src + 1; dst < vec->n_qubits; ++dst) { + vec->actions[idx++] = (CliffordAction){.gate_kind = GATE_CZ, .q0 = src, .q1 = dst}; + } + } +} + +static inline void apply_action(const CliffordVecEnv* vec, CliffordEnv* env, int action_idx) { + const CliffordAction* action = &vec->actions[action_idx]; + const int n = vec->n_qubits; + const int q = action->q0; + const uint64_t x = env->cols[q]; + const uint64_t z = env->cols[n + q]; + if (action->gate_kind == GATE_H) { + env->cols[q] = z; + env->cols[n + q] = x; + } else if (action->gate_kind == GATE_S) { + env->cols[q] = x; + env->cols[n + q] = z ^ x; + } else if (action->gate_kind == GATE_V) { + env->cols[q] = x ^ z; + env->cols[n + q] = z; + } else if (action->gate_kind == GATE_HS) { + env->cols[q] = z; + env->cols[n + q] = x ^ z; + } else if (action->gate_kind == GATE_HV) { + env->cols[q] = x ^ z; + env->cols[n + q] = x; + } else { + env->cols[n + action->q0] ^= env->cols[action->q1]; + env->cols[n + action->q1] ^= env->cols[action->q0]; + } +} + +static inline void write_observation(const CliffordVecEnv* vec, CliffordEnv* env) { + const int dim = vec->dim; + for (int row = 0; row < dim; ++row) { + for (int col = 0; col < dim; ++col) { + env->observations[row * dim + col] = (unsigned char)((env->cols[col] >> row) & 1ULL); + } + } +} + +static inline void reset_single(CliffordVecEnv* vec, CliffordEnv* env) { + reset_episode_state(vec, env); + const int reset_difficulty = sample_reset_difficulty(vec, &env->rng); + if (reset_difficulty <= 0) { + copy_identity_cols(vec, env); + write_observation(vec, env); + return; + } + + do { + copy_identity_cols(vec, env); + for (int step = 0; step < reset_difficulty; ++step) { + const int 
action_idx = sample_action(vec, &env->rng); + if (action_idx < 0) { + break; + } + apply_action(vec, env, action_idx); + } + } while (is_identity(vec, env)); + + write_observation(vec, env); +} + +static inline void add_log(CliffordVecEnv* vec, CliffordEnv* env, int success) { + Log* log = &env->log; + float difficulty_level = (float)vec->difficulty + vec->difficulty_fraction; + log->perf += success ? 1.0f : 0.0f; + log->score += env->episode_return; + log->episode_return += env->episode_return; + log->episode_length += (float)env->episode_length; + log->episode_cz_sum += (float)env->episode_cz_count; + log->success_rate += success ? 1.0f : 0.0f; + log->difficulty += difficulty_level; + log->max_steps += (float)vec->max_steps; + log->n += 1.0f; + if (success) { + log->success_count += 1.0f; + log->success_step_sum += (float)env->episode_length; + log->success_step_sq_sum += (float)(env->episode_length * env->episode_length); + log->success_cz_sum += (float)env->episode_cz_count; + } +} + +static inline float gate_cost_reward(const CliffordVecEnv* vec, int gate_kind) { + return gate_kind == GATE_CZ ? 
-vec->cz_cost : -vec->single_qubit_cost; +} + +static inline float step_single(CliffordVecEnv* vec, CliffordEnv* env) { + int action_idx = ((int)env->actions[0]) % vec->num_actions; + if (action_idx < 0) { + action_idx += vec->num_actions; + } + const CliffordAction* action = &vec->actions[action_idx]; + const int gate_kind = action->gate_kind; + apply_action(vec, env, action_idx); + env->steps += 1; + env->episode_length += 1; + if (gate_kind == GATE_CZ) { + env->episode_cz_count += 1; + } + + int terminated = is_identity(vec, env); + int truncated = (!terminated && env->steps >= env->episode_max_steps); + float reward = gate_cost_reward(vec, gate_kind); + if (terminated) { + reward += vec->goal_bonus; + } else if (truncated) { + reward += vec->failure_penalty; + } + env->episode_return += reward; + env->rewards[0] = reward; + env->terminals[0] = (float)(terminated || truncated); + + if (terminated || truncated) { + add_log(vec, env, terminated); + reset_single(vec, env); + } else { + write_observation(vec, env); + } + return reward; +} + +static inline void c_reset(CliffordEnv* env) { + env->rewards[0] = 0.0f; + env->terminals[0] = 0.0f; + reset_single(env->vec, env); +} + +static inline void c_step(CliffordEnv* env) { + env->rewards[0] = 0.0f; + env->terminals[0] = 0.0f; + step_single(env->vec, env); +} + +static inline void c_render(CliffordEnv* env) { + (void)env; +} + +static inline void c_close(CliffordEnv* env) { + (void)env; +} + +#endif diff --git a/scripts/train_clifford_curriculum.py b/scripts/train_clifford_curriculum.py new file mode 100755 index 0000000000..cbf6b32597 --- /dev/null +++ b/scripts/train_clifford_curriculum.py @@ -0,0 +1,643 @@ +#!/usr/bin/env python3 +import glob +import json +import math +import os +import sys +import time +import copy +from collections import defaultdict + +import rich +import torch + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, 
REPO_ROOT) + +import pufferlib.pufferl +from pufferlib import _C +from pufferlib.torch_pufferl import PuffeRL, load_policy + + +PROFILE_DEFAULTS = { + "fast": { + "HIDDEN_SIZE": 128, + "TOTAL_AGENTS": 256, + "HORIZON": 32, + "MINIBATCH_SIZE": 8192, + "LEARNING_RATE": 0.005, + "ENT_COEF": 0.1, + "MASTERY_THRESHOLD": 0.95, + "FINAL_MASTERY_THRESHOLD": 0.95, + }, + "steady": { + "HIDDEN_SIZE": 256, + "TOTAL_AGENTS": 1024, + "HORIZON": 128, + "MINIBATCH_SIZE": 8192, + "LEARNING_RATE": 0.002, + "ENT_COEF": 0.001, + "FIRST_STAGE_TIMESTEPS": 5_000_000, + "FIRST_STAGE_MIN_TIMESTEPS": 500_000, + "TIMESTEPS_PER_STAGE": 500_000, + "MIN_TIMESTEPS_PER_STAGE": 500_000, + "MASTERY_THRESHOLD": 0.95, + "FINAL_MASTERY_THRESHOLD": 0.95, + }, +} + + +def curriculum_profile(): + profile = os.environ.get("CURRICULUM_PROFILE", "auto").strip().lower() + if profile == "auto": + n_qubits = int(os.environ.get("N_QUBITS", 3)) + return "fast" if n_qubits <= 3 else "steady" + if profile not in PROFILE_DEFAULTS: + names = ", ".join(["auto", *PROFILE_DEFAULTS]) + raise ValueError(f"CURRICULUM_PROFILE must be one of: {names}") + return profile + + +def env_value(name, default): + if name in os.environ: + return os.environ[name] + return PROFILE_DEFAULTS.get(curriculum_profile(), {}).get(name, default) + + +def env_int(name, default): + return int(env_value(name, default)) + + +def env_float(name, default): + return float(env_value(name, default)) + + +def env_str(name, default): + return str(env_value(name, default)) + + +def latest_checkpoint(checkpoint_dir): + pattern = os.path.join(checkpoint_dir, "clifford", "**", "*.bin") + candidates = glob.glob(pattern, recursive=True) + if not candidates: + return None + return max(candidates, key=os.path.getctime) + + +def stage_timesteps(difficulty, stage_idx=None): + first_stage_override = env_value("FIRST_STAGE_TIMESTEPS", None) + if stage_idx == 0 and first_stage_override is not None: + return int(first_stage_override) + + override = 
env_value("TIMESTEPS_PER_STAGE", None) + if override is not None: + return int(override) + if difficulty <= 3: + return 8_000_000 + if difficulty <= 6: + return 12_000_000 + return 16_000_000 + + +def stage_min_timesteps(stage_idx): + first_stage_override = env_value("FIRST_STAGE_MIN_TIMESTEPS", None) + if stage_idx == 0 and first_stage_override is not None: + return int(first_stage_override) + return env_int("MIN_TIMESTEPS_PER_STAGE", 0) + + +def stage_max_steps(difficulty, success_step_stats=None): + return stage_max_steps_with_source(difficulty, success_step_stats)[0] + + +def stage_max_steps_with_source(difficulty, success_step_stats=None): + override = os.environ.get("MAX_STEPS") + if override is not None: + return int(override), f"MAX_STEPS={override}" + + min_steps = env_int("MIN_MAX_STEPS", 4) + base_slack = env_float("MAX_STEPS_BASE_SLACK", 2) + headroom = env_float("MAX_STEPS_HEADROOM", 0) + max_steps = max(min_steps, math.ceil(difficulty + base_slack)) + source = f"ceil(difficulty + MAX_STEPS_BASE_SLACK)={max_steps}" + if headroom > 0: + headroom_steps = math.ceil(difficulty * headroom) + if headroom_steps > max_steps: + max_steps = headroom_steps + source = f"ceil(difficulty * MAX_STEPS_HEADROOM)={max_steps}" + + stddevs = env_float("MAX_STEPS_STDDEVS", 0) + stddevs_after = env_float("MAX_STEPS_STDDEVS_AFTER_DIFFICULTY", 0) + if stddevs > 0 and difficulty >= stddevs_after and success_step_stats is not None: + mean_steps, std_steps = success_step_stats + stddev_steps = math.ceil(mean_steps + stddevs * std_steps) + if stddev_steps > max_steps: + max_steps = stddev_steps + source = ( + f"prev_success_step_mean={mean_steps:.3f} + " + f"{stddevs:g} * prev_success_step_std={std_steps:.3f} " + f"=> {max_steps}" + ) + + force_after = env_float("FORCE_MAX_STEPS_AFTER_DIFFICULTY", -1) + force_value = env_int("FORCE_MAX_STEPS_VALUE", 1000) + if force_after >= 0 and difficulty >= force_after: + return force_value, f"FORCE_MAX_STEPS_VALUE={force_value}" + + 
slack_after = env_float("MAX_STEPS_SLACK_AFTER_DIFFICULTY", 0) + slack = env_int("MAX_STEPS_SLACK", 0) if difficulty >= slack_after else 0 + if slack: + source = f"{source} + MAX_STEPS_SLACK={slack}" + return max_steps + slack, source + + +def format_difficulty(difficulty): + return f"{difficulty:g}" + + +def reached_mastery(perf, stop_threshold, stage_steps, min_timesteps): + if not math.isfinite(perf): + return False + if perf >= 1.0: + return True + return stage_steps >= min_timesteps and perf >= stop_threshold + + +def reached_threshold(perf, stop_threshold): + return math.isfinite(perf) and perf >= stop_threshold + + +def curriculum_difficulties(max_difficulty, stride): + if max_difficulty <= 0: + return [] + if stride <= 0: + raise ValueError("CURRICULUM_STRIDE must be > 0") + + difficulties = [] + current = 1.0 + while current <= max_difficulty + 1e-9: + difficulties.append(round(current, 10)) + current += stride + + if difficulties and difficulties[-1] > max_difficulty: + difficulties[-1] = max_difficulty + elif not difficulties or difficulties[-1] < max_difficulty - 1e-9: + difficulties.append(max_difficulty) + + return difficulties + + +def expected_actions(n_qubits, use_shortcut_gates): + single_qubit_actions = 5 if use_shortcut_gates else 2 + return single_qubit_actions * n_qubits + n_qubits * (n_qubits - 1) // 2 + + +def default_run_name(n_qubits, hidden_size, use_shortcut_gates=True): + action_suffix = "" if use_shortcut_gates else "_hs_cz" + return f"clifford_{n_qubits}q{action_suffix}_mlp{hidden_size}_long" + + +def build_args(difficulty, max_steps, total_timesteps, load_model_path=None): + n_qubits = env_int("N_QUBITS", 3) + use_shortcut_gates = env_int("USE_SHORTCUT_GATES", 1) + hidden_size = env_int("HIDDEN_SIZE", 128) + run_name = default_run_name( + n_qubits, + hidden_size, + use_shortcut_gates=bool(use_shortcut_gates), + ) + return { + "env_name": "clifford", + "rank": 0, + "world_size": 1, + "gpu_id": 0, + "profile": False, + 
"checkpoint_dir": env_str( + "CHECKPOINT_DIR", os.path.join("checkpoints", run_name) + ), + "log_dir": env_str("LOG_DIR", os.path.join("logs", run_name)), + "checkpoint_interval": 1, + "eval_episodes": env_int("TOTAL_AGENTS", 256), + "reset_state": True, + "load_model_path": load_model_path, + "load_enemy_model_path": None, + "load_id": None, + "wandb": False, + "slowly": True, + "render_mode": "auto", + "vec": { + "total_agents": env_int("TOTAL_AGENTS", 256), + "num_buffers": 1, + "num_threads": env_int("NUM_THREADS", 1), + }, + "env": { + "n_qubits": n_qubits, + "difficulty": difficulty, + "max_steps": max_steps, + "single_qubit_cost": env_float("SINGLE_QUBIT_COST", 0.001), + "cz_cost": env_float("CZ_COST", 0.1), + "goal_bonus": env_float("GOAL_BONUS", 1.0), + "failure_penalty": env_float("FAILURE_PENALTY", -1.0), + "use_shortcut_gates": use_shortcut_gates, + "seed": env_int("SEED", 1), + }, + "policy": { + "hidden_size": hidden_size, + "num_layers": env_int("NUM_LAYERS", 2), + "expansion_factor": 1, + }, + "torch": { + "network": env_str("NETWORK", "MLP"), + "encoder": "DefaultEncoder", + "decoder": "DefaultDecoder", + }, + "train": { + "gpus": 1, + "seed": env_int("SEED", 1), + "total_timesteps": total_timesteps, + "learning_rate": env_float("LEARNING_RATE", 0.005), + "anneal_lr": 0, + "min_lr_ratio": 0.0, + "gamma": env_float("GAMMA", 0.995), + "gae_lambda": env_float("GAE_LAMBDA", 0.90), + "replay_ratio": env_float("REPLAY_RATIO", 1.0), + "clip_coef": env_float("CLIP_COEF", 0.2), + "vf_coef": env_float("VF_COEF", 2.0), + "vf_clip_coef": 0.2, + "max_grad_norm": env_float("MAX_GRAD_NORM", 1.5), + "ent_coef": env_float("ENT_COEF", 0.1), + "beta1": 0.95, + "beta2": 0.999, + "eps": 1e-12, + "minibatch_size": env_int("MINIBATCH_SIZE", 8192), + "horizon": env_int("HORIZON", 32), + "vtrace_rho_clip": 1.0, + "vtrace_c_clip": 1.0, + "prio_alpha": 0.8, + "prio_beta0": 0.2, + }, + "sweep": { + "metric": "score", + "downsample": 5, + }, + } + + +def flatten_logs(logs): + 
def save_metrics(log_dir, run_id, args, logs):
    """Write the run configuration and per-key metric series to a JSON file.

    The file is ``<log_dir>/clifford/<run_id>.json``. Each key seen in
    ``logs`` becomes a list holding its values in order; values that
    ``float()`` accepts are stored as floats, all others are kept unchanged.
    The top-level object is ``args`` plus a ``"metrics"`` entry.
    """
    target_dir = os.path.join(log_dir, "clifford")
    os.makedirs(target_dir, exist_ok=True)

    series = defaultdict(list)
    for entry in logs:
        for name, raw in entry.items():
            try:
                converted = float(raw)
            except (TypeError, ValueError):
                # Non-numeric values are recorded as-is.
                converted = raw
            series[name].append(converted)

    payload = {**args, "metrics": dict(series)}
    with open(os.path.join(target_dir, f"{run_id}.json"), "w") as handle:
        json.dump(payload, handle)
def display_logs(
    logs,
    global_step_offset,
    local_step,
    epoch_offset,
    run_start_time,
    previous_logs=None,
):
    """Flatten a log record and rewrite its counters for the dashboard.

    ``agent_steps`` becomes ``global_step_offset + local_step``, ``epoch`` is
    shifted by ``epoch_offset``, and ``uptime`` is the time elapsed since
    ``run_start_time``. When the flattened record has no non-zero ``SPS`` and
    ``previous_logs`` is given, the previous ``SPS`` value is carried over.
    """
    flat = flatten_logs(logs)
    flat.update(
        agent_steps=global_step_offset + local_step,
        epoch=epoch_offset + flat.get("epoch", 0),
        uptime=time.time() - run_start_time,
    )
    sps_missing = flat.get("SPS", 0) == 0
    if previous_logs is not None and sps_missing:
        flat["SPS"] = previous_logs.get("SPS", 0)
    return flat
pufferl.global_step, + min_timesteps, + ): + rich.print( + f"Early stop: env/perf={rollout_perf:.3f} reached mastery " + f"(threshold={stop_threshold:.3f}, min_timesteps={min_timesteps})" + ) + path = save_checkpoint( + pufferl, + args["checkpoint_dir"], + run_id, + global_step_offset + pufferl.global_step, + ) + return ( + pufferl.policy, + pufferl.optimizer, + pufferl.global_step, + rollout_perf, + path, + stage_logs, + True, + ) + + if best_policy_state is not None and pufferl.global_step >= min_timesteps: + pufferl.policy.load_state_dict(best_policy_state) + pufferl.optimizer.load_state_dict(best_optimizer_state) + rich.print( + f"Restoring best stage policy: env/perf={best_mastery_perf:.3f} " + f"at step={best_mastery_step} after min_timesteps={min_timesteps}" + ) + path = save_checkpoint( + pufferl, + args["checkpoint_dir"], + run_id, + global_step_offset + pufferl.global_step, + ) + return ( + pufferl.policy, + pufferl.optimizer, + pufferl.global_step, + best_mastery_perf, + path, + stage_logs, + True, + ) + + pufferl.train() + + flat_logs = display_logs( + pufferl.log(), + global_step_offset, + pufferl.global_step, + epoch_offset, + run_start_time, + stage_logs[-1] if stage_logs else None, + ) + stage_logs.append(flat_logs) + pufferlib.pufferl.print_dashboard(panel_args, model_size, flat_logs) + perf = float(flat_logs.get("env/perf", math.nan)) + path = save_checkpoint( + pufferl, + args["checkpoint_dir"], + run_id, + global_step_offset + pufferl.global_step, + ) + mastered = reached_mastery( + perf, + stop_threshold, + pufferl.global_step, + min_timesteps, + ) + if not mastered and best_policy_state is not None: + pufferl.policy.load_state_dict(best_policy_state) + pufferl.optimizer.load_state_dict(best_optimizer_state) + perf = best_mastery_perf + mastered = True + path = save_checkpoint( + pufferl, + args["checkpoint_dir"], + run_id, + global_step_offset + pufferl.global_step, + ) + return ( + pufferl.policy, + pufferl.optimizer, + pufferl.global_step, + 
def main():
    """Run the Clifford curriculum: one training stage per difficulty.

    Configuration is read through the ``env_*`` helpers. For every
    difficulty returned by ``curriculum_difficulties`` a stage is trained
    with ``train_stage``, retried until it reports mastery, and the policy
    and optimizer it returns are passed on to the next stage.

    Raises:
        RuntimeError: If the loaded ``_C`` module is not the Clifford build
            with ``gpu == 0``, or if a stage is still not mastered once
            ``MAX_STAGE_ATTEMPTS`` (when > 0) attempts have been used.
    """
    if getattr(_C, "env_name", None) != "clifford" or getattr(_C, "gpu", None) != 0:
        raise RuntimeError(
            "Build the Clifford CPU backend before running curriculum training"
        )

    profile = curriculum_profile()
    n_qubits = env_int("N_QUBITS", 3)
    use_shortcut_gates = env_int("USE_SHORTCUT_GATES", 1)
    hidden_size = env_int("HIDDEN_SIZE", 128)
    run_name = default_run_name(
        n_qubits,
        hidden_size,
        use_shortcut_gates=bool(use_shortcut_gates),
    )
    checkpoint_dir = env_str("CHECKPOINT_DIR", os.path.join("checkpoints", run_name))
    log_dir = env_str("LOG_DIR", os.path.join("logs", run_name))
    # Millisecond timestamp; names this run's metrics file and checkpoints.
    run_id = str(int(1000 * time.time()))
    max_difficulty = env_float("MAX_DIFFICULTY", 64)
    curriculum_stride = env_float("CURRICULUM_STRIDE", 0.25)
    advance_threshold = env_float("MASTERY_THRESHOLD", 0.95)
    final_threshold = env_float("FINAL_MASTERY_THRESHOLD", 0.95)
    max_stage_attempts = env_int("MAX_STAGE_ATTEMPTS", 1)
    resume = env_int("RESUME", 1)
    load_path = latest_checkpoint(checkpoint_dir) if resume else None
    run_start_time = time.time()

    # State threaded from one stage (and one attempt) to the next.
    policy = None
    optimizer = None
    global_step_offset = 0
    all_logs = []
    latest_path = None
    difficulties = curriculum_difficulties(max_difficulty, curriculum_stride)
    success_step_stats = None
    keep_max_steps_floor = env_int("KEEP_MAX_STEPS_FLOOR", 0)
    max_steps_floor = 0

    for stage_idx, difficulty in enumerate(difficulties):
        max_steps, max_steps_source = stage_max_steps_with_source(
            difficulty,
            success_step_stats,
        )
        # With the floor enabled, max_steps never drops below the largest
        # value used by an already-mastered stage.
        if keep_max_steps_floor:
            if max_steps_floor > max_steps:
                max_steps = max_steps_floor
                max_steps_source = f"KEEP_MAX_STEPS_FLOOR={max_steps_floor}"
        # Only the last stage is judged against FINAL_MASTERY_THRESHOLD.
        stop_threshold = (
            final_threshold if stage_idx == len(difficulties) - 1 else advance_threshold
        )
        min_timesteps = stage_min_timesteps(stage_idx)
        attempt = 1

        while True:
            args = build_args(
                difficulty=difficulty,
                max_steps=max_steps,
                total_timesteps=stage_timesteps(difficulty, stage_idx),
                load_model_path=load_path,
            )
            # The resume checkpoint is handed to the very first attempt only.
            load_path = None

            rich.print(
                f"\n=== Clifford {n_qubits}q difficulty={format_difficulty(difficulty)} attempt={attempt} "
                f"profile={profile} "
                f"max_steps={max_steps} budget={args['train']['total_timesteps']} "
                f"agents={args['vec']['total_agents']} horizon={args['train']['horizon']} "
                f"minibatch={args['train']['minibatch_size']} "
                f"lr={args['train']['learning_rate']} ent={args['train']['ent_coef']} "
                f"max_steps_source={max_steps_source} "
                f"goal_bonus={args['env']['goal_bonus']} "
                f"failure_penalty={args['env']['failure_penalty']} "
                f"threshold={stop_threshold} "
                f"min_timesteps={min_timesteps} ==="
            )
            policy, optimizer, stage_steps, perf, latest_path, stage_logs, mastered = (
                train_stage(
                    args,
                    policy,
                    optimizer,
                    run_id,
                    global_step_offset,
                    stop_threshold,
                    min_timesteps,
                    run_start_time,
                )
            )
            global_step_offset += stage_steps
            all_logs.extend(stage_logs)

            rich.print(
                f"Stage difficulty={format_difficulty(difficulty)} attempt={attempt} env/perf={perf:.6f} "
                f"global_steps={global_step_offset}"
            )
            # The metrics file is rewritten with the full history after every attempt.
            save_metrics(log_dir, run_id, args, all_logs)

            if mastered:
                if keep_max_steps_floor:
                    max_steps_floor = max(max_steps_floor, max_steps)
                # Keep the success-step statistics from the stage's last log
                # entry; they are passed to the next stage's max_steps
                # computation.
                if stage_logs:
                    mean_steps = stage_logs[-1].get("env/success_step_mean")
                    std_steps = stage_logs[-1].get("env/success_step_std")
                    if mean_steps is not None and std_steps is not None:
                        success_step_stats = (float(mean_steps), float(std_steps))
                break

            # A non-positive MAX_STAGE_ATTEMPTS never triggers this abort.
            if max_stage_attempts > 0 and attempt >= max_stage_attempts:
                criteria = f"env/perf >= {stop_threshold}"
                if min_timesteps > 0:
                    criteria += f" after min_timesteps={min_timesteps}"
                criteria += " or env/perf >= 1.0"
                raise RuntimeError(
                    f"Difficulty {format_difficulty(difficulty)} did not meet advancement criteria "
                    f"({criteria}) within budget={args['train']['total_timesteps']} timesteps "
                    f"after {attempt} attempts"
                )

            rich.print(
                f"Retrying difficulty={format_difficulty(difficulty)}: env/perf={perf:.3f} "
                f"< {stop_threshold:.3f}"
            )
            attempt += 1

    rich.print(f"\nDone. Latest checkpoint: {latest_path}")
# Probe the currently importable pufferlib._C extension.
# Returns 0 only when the embedded Python script finishes without exiting
# early, i.e. when all of the following hold:
#   - pufferlib._C imports, reports env_name "clifford" and gpu 0;
#   - a one-agent vec can be created, and its obs_size / act_sizes / num_atns
#     match the sizes computed from N_QUBITS and USE_SHORTCUT_GATES
#     (read from the environment, defaulting to 3 and 1);
#   - after one step with max_steps=1 the reward is not greater than -1.0
#     (presumably confirming the failure penalty is applied - verify against
#     the environment source);
#   - the vec log exposes both "difficulty" and "max_steps".
# Any failed check exits the Python process with status 1.
has_correct_build() {
  # The quoted 'PY' delimiter keeps the shell from expanding anything inside
  # the script.
  python - <<'PY'
import sys
import ctypes
import os

import numpy as np

try:
    from pufferlib import _C
except Exception:
    raise SystemExit(1)

if getattr(_C, "env_name", None) != "clifford" or getattr(_C, "gpu", None) != 0:
    raise SystemExit(1)

use_shortcut_gates = int(os.environ.get("USE_SHORTCUT_GATES", "1"))
n_qubits = int(os.environ.get("N_QUBITS", "3"))
expected_obs_size = (2 * n_qubits) ** 2
expected_actions = (5 if use_shortcut_gates else 2) * n_qubits + n_qubits * (n_qubits - 1) // 2
args = {
    "vec": {
        "total_agents": 1,
        "num_buffers": 1,
    },
    "env": {
        "n_qubits": n_qubits,
        "difficulty": 0,
        "max_steps": 1,
        "single_qubit_cost": 0.001,
        "cz_cost": 0.1,
        "goal_bonus": 1.0,
        "failure_penalty": -1.0,
        "use_shortcut_gates": use_shortcut_gates,
        "seed": 0,
    },
}

try:
    vec = _C.create_vec(args, 0)
except Exception:
    raise SystemExit(1)

try:
    if vec.obs_size != expected_obs_size or vec.act_sizes != [expected_actions] or vec.num_atns != 1:
        raise SystemExit(1)
    actions = np.zeros((1, 1), dtype=np.float32)
    vec.cpu_step(actions.ctypes.data)
    rewards = np.ctypeslib.as_array((ctypes.c_float * 1).from_address(vec.rewards_ptr))
    if rewards[0] > -1.0:
        raise SystemExit(1)
    vec.cpu_step(actions.ctypes.data)
    log = vec.log()
    if "difficulty" not in log or "max_steps" not in log:
        raise SystemExit(1)
finally:
    vec.close()
PY
}
# On macOS, repoint the built extension's libomp dependency at the copy that
# ships inside the installed PyTorch package, so only one OpenMP runtime is
# loaded. Does nothing (returns 0) on other systems, when the extension file
# is missing, or when PyTorch has no lib/libomp.dylib.
fix_macos_openmp() {
  [[ "$(uname -s)" == "Darwin" ]] || return 0

  local ext_suffix output torch_libomp linked_libomp
  # Path of the compiled extension, e.g. pufferlib/_C<EXT_SUFFIX>.
  ext_suffix="$(python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")"
  output="pufferlib/_C${ext_suffix}"
  [[ -f "$output" ]] || return 0

  # Prints the path of torch's bundled libomp.dylib, or an empty line when
  # torch cannot be imported or the file does not exist.
  torch_libomp="$(
    python - <<'PY'
import os
try:
    import torch
    path = os.path.join(os.path.dirname(torch.__file__), "lib", "libomp.dylib")
    print(path if os.path.exists(path) else "")
except Exception:
    print("")
PY
  )"
  [[ -n "$torch_libomp" ]] || return 0

  # First libomp.dylib entry in the extension's linked-library list.
  linked_libomp="$(otool -L "$output" | awk '/libomp\.dylib/{print $1; exit}')"
  if [[ -n "$linked_libomp" && "$linked_libomp" != "$torch_libomp" ]]; then
    echo "Pointing $output at PyTorch libomp to avoid duplicate OpenMP runtimes..."
    install_name_tool -change "$linked_libomp" "$torch_libomp" "$output"
  fi
}
+ for qubit in range(n_qubits): + actions.append((gate, qubit, -1)) + for src in range(n_qubits): + for dst in range(src + 1, n_qubits): + actions.append(("cz", src, dst)) + return actions + + +def _find_action(actions, gate_name, q0, q1=-1): + for idx, action in enumerate(actions): + if action == (gate_name, q0, q1): + return idx + raise AssertionError(f"Action {(gate_name, q0, q1)} not found") + + +def _expected_obs_after_identity_gate(n_qubits, gate_name, q0, q1=-1): + matrix = _identity_symplectic(n_qubits) + x_col = matrix[:, q0].copy() + z_col = matrix[:, n_qubits + q0].copy() + if gate_name == "h": + matrix[:, q0] = z_col + matrix[:, n_qubits + q0] = x_col + elif gate_name == "s": + matrix[:, n_qubits + q0] ^= x_col + elif gate_name == "v": + matrix[:, q0] = x_col ^ z_col + elif gate_name == "hs": + matrix[:, q0] = z_col + matrix[:, n_qubits + q0] = x_col ^ z_col + elif gate_name == "hv": + matrix[:, q0] = x_col ^ z_col + matrix[:, n_qubits + q0] = x_col + elif gate_name == "cz": + q1_x_col = matrix[:, q1].copy() + matrix[:, n_qubits + q0] ^= q1_x_col + matrix[:, n_qubits + q1] ^= x_col + else: + raise AssertionError(f"Unknown gate {gate_name}") + return matrix.reshape(-1) + + +def _obs_array(vec): + raw = (ctypes.c_uint8 * (vec.total_agents * vec.obs_size)).from_address(vec.obs_ptr) + return np.ctypeslib.as_array(raw).reshape(vec.total_agents, vec.obs_size) + + +def _float_array(ptr, length): + raw = (ctypes.c_float * length).from_address(ptr) + return np.ctypeslib.as_array(raw) + + +def _make_args( + num_envs=1, + n_qubits=None, + difficulty=0, + max_steps=8, + goal_bonus=0.0, + failure_penalty=-1.0, + single_qubit_cost=0.001, + cz_cost=0.1, + use_shortcut_gates=1, + seed=0, +): + env_args = { + "difficulty": difficulty, + "max_steps": max_steps, + "single_qubit_cost": single_qubit_cost, + "cz_cost": cz_cost, + "goal_bonus": goal_bonus, + "failure_penalty": failure_penalty, + "use_shortcut_gates": use_shortcut_gates, + "seed": seed, + } + if n_qubits is 
not None: + env_args["n_qubits"] = n_qubits + + return { + "vec": { + "total_agents": num_envs, + "num_buffers": 1, + }, + "env": env_args, + } + + +def _make_vec(**kwargs): + vec = _C.create_vec(_make_args(**kwargs), 0) + vec.reset() + return vec + + +def _step(vec, action_idx, num_envs=1): + actions = np.full((num_envs, 1), float(action_idx), dtype=np.float32) + vec.cpu_step(actions.ctypes.data) + obs = _obs_array(vec).copy() + rewards = _float_array(vec.rewards_ptr, num_envs).copy() + terminals = _float_array(vec.terminals_ptr, num_envs).copy() + return obs, rewards, terminals + + +def _n_qubits_from_vec(vec): + dim = int(round(vec.obs_size**0.5)) + assert dim * dim == vec.obs_size + assert dim % 2 == 0 + return dim // 2 + + +def test_build_actions_full_connectivity_order(): + actions = _build_actions(3) + assert len(actions) == 18 + assert actions[:5] == [ + ("h", 0, -1), + ("h", 1, -1), + ("h", 2, -1), + ("s", 0, -1), + ("s", 1, -1), + ] + assert actions[-3:] == [ + ("cz", 0, 1), + ("cz", 0, 2), + ("cz", 1, 2), + ] + assert actions[6:9] == [ + ("v", 0, -1), + ("v", 1, -1), + ("v", 2, -1), + ] + assert len(_build_actions(3, use_shortcut_gates=False)) == 9 + + +@pytest.mark.skipif( + not BINDING_AVAILABLE, reason="native clifford binding is not built" +) +def test_native_vec_shapes_and_dtypes(): + vec = _make_vec(num_envs=4, difficulty=0) + try: + n_qubits = _n_qubits_from_vec(vec) + obs_size = (2 * n_qubits) ** 2 + num_actions = 5 * n_qubits + n_qubits * (n_qubits - 1) // 2 + assert vec.total_agents == 4 + assert vec.obs_size == obs_size + assert vec.num_atns == 1 + assert vec.act_sizes == [num_actions] + assert vec.obs_dtype == "ByteTensor" + obs = _obs_array(vec) + assert obs.shape == (4, obs_size) + assert obs.dtype == np.uint8 + np.testing.assert_array_equal( + obs[0], _identity_symplectic(n_qubits).reshape(-1) + ) + finally: + vec.close() + + +@pytest.mark.skipif( + not BINDING_AVAILABLE, reason="native clifford binding is not built" +) 
@pytest.mark.skipif(
    not BINDING_AVAILABLE, reason="native clifford binding is not built"
)
def test_native_terminal_step_auto_resets_to_identity():
    """Applying H on qubit 0 twice ends the episode and resets the observation.

    The second step is expected to report the goal bonus minus one
    single-qubit cost (5.0 - 0.001), a terminal flag, and an identity
    observation.
    """
    vec = _make_vec(difficulty=0, max_steps=8, goal_bonus=5.0)
    try:
        qubit_count = _n_qubits_from_vec(vec)
        hadamard_idx = _find_action(_build_actions(qubit_count), "h", 0)

        # First H leaves the identity; the second one returns to it.
        _step(vec, hadamard_idx)
        observation, reward, terminal = _step(vec, hadamard_idx)

        assert reward[0] == pytest.approx(4.999, abs=1e-6)
        assert bool(terminal[0])
        identity_flat = _identity_symplectic(qubit_count).reshape(-1)
        np.testing.assert_array_equal(observation[0], identity_flat)
    finally:
        vec.close()
@pytest.mark.skipif(
    not BINDING_AVAILABLE, reason="native clifford binding is not built"
)
def test_native_cz_cost_is_configurable():
    """A single CZ step is charged the ``cz_cost`` passed at construction."""
    vec = _make_vec(difficulty=0, max_steps=8, cz_cost=0.05)
    try:
        qubit_count = _n_qubits_from_vec(vec)
        if qubit_count < 2:
            pytest.skip("CZ requires at least two compiled qubits")

        cz_idx = _find_action(_build_actions(qubit_count), "cz", 0, 1)
        _unused_obs, reward, terminal = _step(vec, cz_idx)

        assert reward[0] == pytest.approx(-0.05, abs=1e-6)
        assert not bool(terminal[0])
    finally:
        vec.close()