diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 1b32e1a1a..1dde6c726 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -237,6 +237,7 @@ jobs: # - "No_Position_Experiment" - "Othello_GPT" - "Patchscopes_Generation_Demo" + - "Qwen" # - "T5" steps: - uses: actions/checkout@v3 diff --git a/demos/Qwen.ipynb b/demos/Qwen.ipynb index 96732abe8..1f4eb2c94 100644 --- a/demos/Qwen.ipynb +++ b/demos/Qwen.ipynb @@ -9,75 +9,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: transformers_stream_generator in /root/TransformerLens/.venv/lib/python3.10/site-packages (0.0.4)\n", - "Requirement already satisfied: plotly in /root/TransformerLens/.venv/lib/python3.10/site-packages (5.18.0)\n", - "Requirement already satisfied: circuitsvis in /root/TransformerLens/.venv/lib/python3.10/site-packages (1.43.2)\n", - "Requirement already satisfied: huggingface_hub in /root/TransformerLens/.venv/lib/python3.10/site-packages (0.20.2)\n", - "Requirement already satisfied: einops in /root/TransformerLens/.venv/lib/python3.10/site-packages (0.7.0)\n", - "Requirement already satisfied: tiktoken in /root/TransformerLens/.venv/lib/python3.10/site-packages (0.5.2)\n", - "Requirement already satisfied: datasets in /root/TransformerLens/.venv/lib/python3.10/site-packages (2.14.4)\n", - "Requirement already satisfied: transformers>=4.26.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from transformers_stream_generator) (4.37.2)\n", - "Requirement already satisfied: tenacity>=6.2.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from plotly) (8.2.3)\n", - "Requirement already satisfied: packaging in /root/TransformerLens/.venv/lib/python3.10/site-packages (from plotly) (23.2)\n", - "Requirement already satisfied: importlib-metadata>=5.1.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (7.0.1)\n", - "Requirement already satisfied: numpy>=1.24 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (1.26.3)\n", - "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (12.1.3.1)\n", - "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (12.1.105)\n", - "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (12.1.105)\n", - "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (12.1.105)\n", - "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (8.9.2.26)\n", - "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (11.0.2.54)\n", - "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (10.3.2.106)\n", - "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (11.4.5.107)\n", - "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (12.1.0.106)\n", - "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (2.18.1)\n", - "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (12.1.105)\n", - "Requirement already satisfied: torch>=1.10 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (2.1.2)\n", - "Requirement already satisfied: triton==2.1.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from circuitsvis) (2.1.0)\n", - "Requirement already satisfied: nvidia-nvjitlink-cu12 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->circuitsvis) (12.3.101)\n", - "Requirement already satisfied: filelock in /root/TransformerLens/.venv/lib/python3.10/site-packages (from triton==2.1.0->circuitsvis) (3.13.1)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n", - "Requirement already satisfied: requests in /root/TransformerLens/.venv/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n", - "Requirement already satisfied: tqdm>=4.42.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from huggingface_hub) (4.66.1)\n", - "Requirement already satisfied: pyyaml>=5.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n", - "Requirement already satisfied: regex>=2022.1.18 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from tiktoken) (2023.12.25)\n", - "Requirement already satisfied: pyarrow>=8.0.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from datasets) (14.0.2)\n", - "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from datasets) (0.3.7)\n", - "Requirement already satisfied: pandas in /root/TransformerLens/.venv/lib/python3.10/site-packages (from datasets) (2.0.3)\n", - "Requirement already satisfied: xxhash in /root/TransformerLens/.venv/lib/python3.10/site-packages (from datasets) (3.4.1)\n", - "Requirement already satisfied: multiprocess in /root/TransformerLens/.venv/lib/python3.10/site-packages (from datasets) (0.70.15)\n", - "Requirement already satisfied: aiohttp in /root/TransformerLens/.venv/lib/python3.10/site-packages (from datasets) (3.9.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from aiohttp->datasets) (23.2.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.9.4)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.4.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.1)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.3)\n", - "Requirement already satisfied: zipp>=0.5 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from importlib-metadata>=5.1.0->circuitsvis) (3.17.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from requests->huggingface_hub) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from requests->huggingface_hub) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from requests->huggingface_hub) (2.1.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n", - "Requirement already satisfied: sympy in /root/TransformerLens/.venv/lib/python3.10/site-packages (from torch>=1.10->circuitsvis) (1.12)\n", - "Requirement already satisfied: networkx in /root/TransformerLens/.venv/lib/python3.10/site-packages (from torch>=1.10->circuitsvis) (3.1)\n", - "Requirement already satisfied: jinja2 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from torch>=1.10->circuitsvis) (3.1.2)\n", - "Requirement already satisfied: tokenizers<0.19,>=0.14 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from transformers>=4.26.1->transformers_stream_generator) (0.15.0)\n", - "Requirement already satisfied: safetensors>=0.4.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from transformers>=4.26.1->transformers_stream_generator) (0.4.1)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from pandas->datasets) (2023.3.post1)\n", - "Requirement already satisfied: tzdata>=2022.1 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from pandas->datasets) (2023.4)\n", - "Requirement already satisfied: six>=1.5 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from jinja2->torch>=1.10->circuitsvis) (2.1.3)\n", - "Requirement already satisfied: mpmath>=0.19 in /root/TransformerLens/.venv/lib/python3.10/site-packages (from sympy->torch>=1.10->circuitsvis) (1.3.0)\n", - "\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.3.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.0\u001B[0m\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n", + "/Users/jlarson/Documents/PROJECTS/TransformerLens/.venv/bin/python: No module named pip\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ + "# NBVAL_IGNORE_OUTPUT\n", "%pip install transformers_stream_generator plotly circuitsvis huggingface_hub einops tiktoken datasets" ] }, @@ -92,19 +30,10 @@ "text": [ "Running as a Jupyter notebook - intended for development only!\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_13850/410710250.py:21: DeprecationWarning: `magic(...)` is deprecated since IPython 0.13 (warning added in 8.1), use run_line_magic(magic_name, parameter_s).\n", - " ipython.magic(\"load_ext autoreload\")\n", - "/tmp/ipykernel_13850/410710250.py:22: DeprecationWarning: `magic(...)` is deprecated since IPython 0.13 (warning added in 8.1), use run_line_magic(magic_name, parameter_s).\n", - " ipython.magic(\"autoreload 2\")\n" - ] } ], "source": [ + "# NBVAL_IGNORE_OUTPUT\n", "# Janky code to do different setup when run in a Colab notebook vs VSCode\n", "DEVELOPMENT_MODE = False\n", "try:\n", @@ -156,23 +85,14 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/root/TransformerLens\n" - ] - } - ], + "outputs": [], "source": [ - "%cd ~/TransformerLens\n", - "import torch\n", "# NBVAL_IGNORE_OUTPUT\n", + "import torch\n", "_ = torch.set_grad_enabled(False)\n", "\n", "from transformers import AutoTokenizer\n", - "from transformer_lens import HookedTransformer\n", + "from transformer_lens.model_bridge import TransformerBridge\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "from transformers.generation import GenerationConfig\n", "\n", @@ -195,7 +115,8 @@ " prompt_toks = tokenizer(prompt, return_tensors=\"pt\").input_ids\n", "\n", " hf_logits = hf_model(prompt_toks.to(hf_model.device)).logits\n", - " tl_logits = tl_model(prompt_toks).to(hf_logits)\n", + " tl_device = next(tl_model.parameters()).device\n", + " tl_logits = tl_model(prompt_toks.to(tl_device)).to(hf_logits)\n", "\n", " assert torch.allclose(torch.softmax(hf_logits, dim=-1), torch.softmax(tl_logits, dim=-1), atol=atol)" ] @@ -204,7 +125,55 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Qwen, first generation" + "## Qwen, first generation\n", + "\n", + "> **Note:** Qwen gen1 (`Qwen/Qwen-1_8B-Chat`) requires `trust_remote_code=True` and depends on\n", + "> `transformers_stream_generator`, which is incompatible with `transformers>=5.0.0`\n", + "> (`DisjunctiveConstraint` was removed). This section is commented out until the upstream\n", + "> dependency is updated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NBVAL_SKIP\n", + "# model_path = \"Qwen/Qwen-1_8B-Chat\"\n", + "# device = \"cuda\" if torch.cuda.is_available() else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n", + "# \n", + "# tokenizer = AutoTokenizer.from_pretrained(\n", + "# model_path,\n", + "# trust_remote_code=True\n", + "# )\n", + "# \n", + "# hf_model = AutoModelForCausalLM.from_pretrained(\n", + "# model_path,\n", + "# device_map=device,\n", + "# torch_dtype=torch.float32,\n", + "# use_logn_attn=False,\n", + "# use_dynamic_ntk=False,\n", + "# scale_attn_weights=False,\n", + "# trust_remote_code=True\n", + "# ).eval()\n", + "# \n", + "# tl_model = TransformerBridge.boot_transformers(\n", + "# model_path,\n", + "# device=device,\n", + "# dtype=torch.float32,\n", + "# trust_remote_code=True,\n", + "# )\n", + "# tl_model.enable_compatibility_mode()\n", + "# \n", + "# assert_hf_and_tl_model_are_close(hf_model, tl_model, tokenizer)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Qwen2" ] }, { @@ -216,131 +185,72 @@ "name": "stderr", "output_type": "stream", "text": [ - "Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".\n" + "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2cffaf8715b64623b6799822d7cf1cfe", + "model_id": "00c489dffa18461f932645808546a319", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Loading checkpoint shards: 0%| | 0/2 [00:00 None: "blocks.{i}.attn.k.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.v.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.o.weight": ParamProcessingConversion( diff --git a/transformer_lens/model_bridge/supported_architectures/gemma3_multimodal.py b/transformer_lens/model_bridge/supported_architectures/gemma3_multimodal.py index ca9210896..6c9e71941 100644 --- a/transformer_lens/model_bridge/supported_architectures/gemma3_multimodal.py +++ b/transformer_lens/model_bridge/supported_architectures/gemma3_multimodal.py @@ -80,13 +80,13 @@ def __init__(self, cfg: Any) -> None: "blocks.{i}.attn.k.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.v.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.o.weight": ParamProcessingConversion( diff --git a/transformer_lens/model_bridge/supported_architectures/llama.py b/transformer_lens/model_bridge/supported_architectures/llama.py index 7c9ae0a72..acba3bcab 100644 --- a/transformer_lens/model_bridge/supported_architectures/llama.py +++ b/transformer_lens/model_bridge/supported_architectures/llama.py @@ -77,13 +77,13 @@ def __init__(self, cfg: Any) -> None: "blocks.{i}.attn.k.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.v.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.o.weight": ParamProcessingConversion( diff --git a/transformer_lens/model_bridge/supported_architectures/qwen2.py b/transformer_lens/model_bridge/supported_architectures/qwen2.py index 8a905e7c0..4a1b37153 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen2.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen2.py @@ -62,13 +62,13 @@ def __init__(self, cfg: Any) -> None: "blocks.{i}.attn.k.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.v.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.o.weight": ParamProcessingConversion( diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3.py b/transformer_lens/model_bridge/supported_architectures/qwen3.py index 31a871b3c..55da0bc03 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3.py @@ -63,13 +63,13 @@ def __init__(self, cfg: Any) -> None: "blocks.{i}.attn.k.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.v.weight": ParamProcessingConversion( tensor_conversion=RearrangeTensorConversion( "(n h) m -> n m h", - n=getattr(self.cfg, "n_key_value_heads", self.cfg.n_heads), + n=getattr(self.cfg, "n_key_value_heads", None) or self.cfg.n_heads, ), ), "blocks.{i}.attn.o.weight": ParamProcessingConversion(