diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml deleted file mode 100644 index 76f6ad3..0000000 --- a/.devcontainer/docker-compose.yml +++ /dev/null @@ -1,29 +0,0 @@ -version: "3.9" - -services: - r-dev: - build: - context: .. - dockerfile: .devcontainer/r/Dockerfile - volumes: - - ..:/workspaces/project:cached - - renv_cache:/opt/renv/cache - - ${SECURE_DATA_DIR:-}:/secure-data:ro - working_dir: /workspaces/project - command: sleep infinity - - py-dev: - build: - context: .. - dockerfile: .devcontainer/py/Dockerfile - volumes: - - ..:/workspaces/project:cached - - ${SECURE_DATA_DIR:-}:/secure-data:ro - working_dir: /workspaces/project - command: sleep infinity - -volumes: - renv_cache: - poetry_cache: - pip_cache: - diff --git a/.devcontainer/r/Dockerfile b/.devcontainer/r/Dockerfile deleted file mode 100644 index 6c2368b..0000000 --- a/.devcontainer/r/Dockerfile +++ /dev/null @@ -1,48 +0,0 @@ -# Pin R version here (adjust as needed) -FROM rocker/r-ver:4.4.2 - -ENV DEBIAN_FRONTEND=noninteractive - -# System deps commonly needed for Bioconductor + ggplot + compilation -RUN apt-get update && apt-get install -y --no-install-recommends \ - git \ - curl \ - wget \ - ca-certificates \ - locales \ - build-essential \ - pkg-config \ - libcurl4-openssl-dev \ - libssl-dev \ - libxml2-dev \ - libfontconfig1-dev \ - libfreetype6-dev \ - libpng-dev \ - libtiff5-dev \ - libjpeg-dev \ - libcairo2-dev \ - libharfbuzz-dev \ - libfribidi-dev \ - libglpk-dev \ - && rm -rf /var/lib/apt/lists/* - -# Locale (helps with some R packages / rendering) -RUN sed -i 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ - locale-gen -ENV LANG=en_US.UTF-8 -ENV LC_ALL=en_US.UTF-8 - -# Install renv in the system library (project will still use renv.lock) -RUN R -q -e "install.packages('renv', repos='https://cloud.r-project.org')" - -# --- Quarto (pin version for reproducible rendering) --- -ENV QUARTO_VERSION=1.5.57 -RUN wget -q 
"https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.deb" \ - && dpkg -i "quarto-${QUARTO_VERSION}-linux-amd64.deb" \ - && rm -f "quarto-${QUARTO_VERSION}-linux-amd64.deb" - -# Optional: speed up / standardize renv cache location (persisted via volume) -ENV RENV_PATHS_CACHE=/opt/renv/cache - -WORKDIR /workspaces/project - diff --git a/.devcontainer/r/devcontainer.json b/.devcontainer/r/devcontainer.json deleted file mode 100644 index d28daad..0000000 --- a/.devcontainer/r/devcontainer.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "name": "project (R + renv)", - "dockerComposeFile": ["../docker-compose.yml"], - "service": "r-dev", - "workspaceFolder": "/workspaces/project", - "shutdownAction": "stopCompose", - - "customizations": { - "vscode": { - "extensions": [ - "REditorSupport.r", - "rdebugger.r-debugger", - "Ikuyadeu.r", - "ms-azuretools.vscode-docker", - "GitHub.copilot" - ], - "settings": { - "r.rterm.linux": "/usr/bin/R", - "r.bracketedPaste": true - } - } - }, - - "postCreateCommand": "bash -lc 'quarto --version && if [ -f renv.lock ]; then R -q -e \"renv::restore(prompt = FALSE)\"; else echo \"No renv.lock yet. Run: R -q -e \\\"renv::init()\\\"\"; fi'" -} - diff --git a/.gitignore b/.gitignore index b2f88fe..0d024df 100644 --- a/.gitignore +++ b/.gitignore @@ -8,11 +8,8 @@ __pycache__/ .pytest_cache/ .mypy_cache/ .ruff_cache/ - -# R -.Rhistory -.RData -.Rproj.user/ +dist/ +*.egg-info/ # Outputs results/ @@ -22,12 +19,6 @@ data/**/processed/ # OS .DS_Store -# Quarto -.quarto/ -_quarto/ -_site/ -_freeze/ - .devcontainer/.env .env config/local.env diff --git a/README.md b/README.md index d30eb23..6babcea 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,398 @@ -# CompBio template: R (renv) + Python (Poetry) in separate Dev Containers +# Omicslog -## Before to start -1. Modify the `.devcontainer/.env` file with the path for your raw data. +## Importing packages -## Quick start (VS Code) -1. 
Open this repository in VS Code. -2. Command Palette → **Dev Containers: Reopen in Container** -3. Choose either: - - **project (R + renv)** for Bioconductor/ggplot work - - **project (Python + Poetry)** for ML/AI work -Both containers mount the same repository, so `data/` and `results/` are shared. +```python +from omicslog import log_start +import numpy as np +import pandas as pd +import anndata as ad +from scipy.sparse import csr_matrix -## R workflow (inside R container) -- Initialize renv (first time): `make r-init` -- Restore: `make r-restore` -- Snapshot: `make r-snapshot` -- Smoke test: `make r-check` +``` -## Python workflow (inside Python container) -- Install deps: `make py-install` -- Lock: `make py-lock` -- Smoke test: `make py-check` +
+📝 Note
+The AnnData object was generated using code from the original AnnData documentation. +
+ + +```python +counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32) +adata = ad.AnnData(counts) + +adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)] +adata.var_names = [f"Gene_{i:d}" for i in range(adata.n_vars)] + +logdata = log_start(adata) +print(logdata) + +ct = np.random.choice(["B", "T", "Monocyte"], size=(logdata.n_obs,)) +logdata.obs["cell_type"] = pd.Categorical(ct) # Categoricals are preferred for efficiency +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 100 × 2000 + uns: '_omicslog' + AnnData object with n_obs × n_vars = 100 × 2000 + obs: 'cell_type' + uns: '_omicslog' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + + + + + +
+ + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
+
+ + + +## Filtering by Cells (.obs) + + +```python +logdata = logdata[logdata.obs.cell_type == "B"] +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 2000 + obs: 'cell_type' + uns: '_omicslog' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
+
+ + + +## Filtering by Genes (.var) + + +```python +logdata = logdata[:,logdata.var_names.str.endswith("1")] +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 200 + obs: 'cell_type' + uns: '_omicslog' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
+
+ + + +## Adding observations and variables + + +```python +logdata.obsm["X_umap"] = np.random.normal(0, 1, size=(logdata.n_obs, 2)) +logdata.varm["gene_stuff"] = np.random.normal(0, 1, size=(logdata.n_vars, 5)) +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 200 + obs: 'cell_type' + uns: '_omicslog' + obsm: 'X_umap' + varm: 'gene_stuff' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining + [2026-05-20 13:49:48] obsm: 'X_umap' added + [2026-05-20 13:49:48] varm: 'gene_stuff' added + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
32026-05-20 13:49:48obsm'X_umap' added
42026-05-20 13:49:48varm'gene_stuff' added
+
+ + + +## Adding layers + + +```python +logdata.layers["log_transformed"] = np.log1p(logdata.X) +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 200 + obs: 'cell_type' + uns: '_omicslog' + obsm: 'X_umap' + varm: 'gene_stuff' + layers: 'log_transformed' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining + [2026-05-20 13:49:48] obsm: 'X_umap' added + [2026-05-20 13:49:48] varm: 'gene_stuff' added + [2026-05-20 13:49:50] layers: 'log_transformed' added + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
32026-05-20 13:49:48obsm'X_umap' added
42026-05-20 13:49:48varm'gene_stuff' added
52026-05-20 13:49:50layers'log_transformed' added
+
-## CI -GitHub Actions builds both images and runs smoke tests on push/PR. diff --git a/analysis/py/adata_reference_code.ipynb b/analysis/py/adata_reference_code.ipynb deleted file mode 100644 index cc84126..0000000 --- a/analysis/py/adata_reference_code.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8fb259db", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "dbb1493c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.12.10\n", - "AnnData object with n_obs × n_vars = 100 × 2000\n", - "Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',\n", - " 'Cell_7', 'Cell_8', 'Cell_9'],\n", - " dtype='object')\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_45018/475607713.py:5: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n", - " print(ad.__version__)\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import anndata as ad\n", - "from scipy.sparse import csr_matrix\n", - "print(ad.__version__)\n", - "\n", - "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n", - "adata = ad.AnnData(counts)\n", - "print(adata)\n", - "\n", - "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n", - "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n", - "print(adata.obs_names[:10])" - ] - }, - { - "cell_type": "markdown", - "id": "beed1340", - "metadata": {}, - "source": [ - "# Filtering by Cells" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f0969e08", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "View of AnnData object with n_obs × n_vars = 2 × 2000" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "adata = adata[[\"Cell_1\", 
\"Cell_10\"], ]\n", - "adata" - ] - }, - { - "cell_type": "markdown", - "id": "3f9bfc1c", - "metadata": {}, - "source": [ - "# Filtering by cells (.obs)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a0b17993", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_45018/910546738.py:2: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n", - " adata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cell_type
Cell_1B
Cell_10T
\n", - "
" - ], - "text/plain": [ - " cell_type\n", - "Cell_1 B\n", - "Cell_10 T" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(adata.n_obs,))\n", - "adata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n", - "adata.obs" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c77a07da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "View of AnnData object with n_obs × n_vars = 2 × 2000\n", - " obs: 'cell_type'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "adata = adata[adata.obs.cell_type == \"B\"]\n", - "adata" - ] - }, - { - "cell_type": "markdown", - "id": "a3c37b5a", - "metadata": {}, - "source": [ - "# Adding layers" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "fab07d95", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_44426/2818935080.py:1: ImplicitModificationWarning: Setting element `.layers['log_transformed']` of view, initializing view as actual.\n", - " adata.layers[\"log_transformed\"] = np.log1p(adata.X)\n" - ] - }, - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 2 × 2000\n", - " obs: 'cell_type'\n", - " layers: 'log_transformed'" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "adata.layers[\"log_transformed\"] = np.log1p(adata.X)\n", - "adata" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.14" - } 
- }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analysis/py/omicslog_beta.ipynb b/analysis/py/omicslog_beta.ipynb deleted file mode 100644 index d194419..0000000 --- a/analysis/py/omicslog_beta.ipynb +++ /dev/null @@ -1,553 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f0b85c1e", - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "from copy import deepcopy\n", - "from dataclasses import dataclass, field\n", - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import anndata as ad\n", - "\n", - "LOG_KEY = \"_omicslog\"\n", - "\n", - "\n", - "def _timestamp() -> str:\n", - " return datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", - "\n", - "\n", - "def _format_log_message(operation: str, message: str, ts: str | None = None) -> str:\n", - " stamp = ts or _timestamp()\n", - " return f\"[{stamp}] {operation}: {message}\"\n", - "\n", - "\n", - "def _ensure_log_container(adata: ad.AnnData) -> list[str]:\n", - " current = adata.uns.get(LOG_KEY)\n", - " if not isinstance(current, list):\n", - " adata.uns[LOG_KEY] = []\n", - " return adata.uns[LOG_KEY]\n", - "\n", - "\n", - "def _append_log_messages(adata: ad.AnnData, messages: list[str] | tuple[str, ...]) -> None:\n", - " if not messages:\n", - " return\n", - " _ensure_log_container(adata).extend(messages)\n", - "\n", - "\n", - "def _inherit_and_append(\n", - " parent: ad.AnnData,\n", - " child: ad.AnnData,\n", - " messages: list[str] | tuple[str, ...],\n", - ") -> None:\n", - " child.uns[LOG_KEY] = list(_ensure_log_container(parent))\n", - " _append_log_messages(child, messages)\n", - "\n", - "def _parent_set(obj, attr: str, value) -> None:\n", - " \"\"\"Set an attribute via the first parent class that defines it.\n", - " Works for both plain properties (.fset) and custom descriptors (.__set__).\n", - " \"\"\"\n", - " for base in type(obj).__mro__[1:]:\n", - " if attr in base.__dict__:\n", - " 
base.__dict__[attr].__set__(obj, value)\n", - " return\n", - " object.__setattr__(obj, attr, value)\n", - "\n", - "class _LoggingProxy:\n", - " \"\"\"\n", - " Transparent proxy for dict-like AnnData components (layers, obsm, varm, ...).\n", - " Intercepts __setitem__ and __delitem__ to log mutations automatically.\n", - " \"\"\"\n", - "\n", - " def __init__(self, wrapped, owner: \"LoggedAnnDataStandalone\", label: str):\n", - " object.__setattr__(self, \"_w\", wrapped)\n", - " object.__setattr__(self, \"_owner\", owner)\n", - " object.__setattr__(self, \"_label\", label)\n", - "\n", - " def __setitem__(self, key: str, value) -> None:\n", - " w = object.__getattribute__(self, \"_w\")\n", - " owner = object.__getattribute__(self, \"_owner\")\n", - " label = object.__getattribute__(self, \"_label\")\n", - " verb = \"updated\" if key in w else \"added\"\n", - " w[key] = value\n", - " _append_log_messages(owner, [_format_log_message(label, f\"'{key}' {verb}\")])\n", - "\n", - " def __delitem__(self, key: str) -> None:\n", - " w = object.__getattribute__(self, \"_w\")\n", - " owner = object.__getattribute__(self, \"_owner\")\n", - " label = object.__getattribute__(self, \"_label\")\n", - " del w[key]\n", - " _append_log_messages(owner, [_format_log_message(label, f\"'{key}' removed\")])\n", - "\n", - " def __getitem__(self, key):\n", - " return object.__getattribute__(self, \"_w\")[key]\n", - "\n", - " def __getattr__(self, name):\n", - " return getattr(object.__getattribute__(self, \"_w\"), name)\n", - "\n", - " def __contains__(self, key):\n", - " return key in object.__getattribute__(self, \"_w\")\n", - "\n", - " def __iter__(self):\n", - " return iter(object.__getattribute__(self, \"_w\"))\n", - "\n", - " def __len__(self):\n", - " return len(object.__getattribute__(self, \"_w\"))\n", - "\n", - " def __repr__(self):\n", - " return repr(object.__getattribute__(self, \"_w\"))\n", - "\n", - "\n", - "@dataclass\n", - "class AnnDataSnapshot:\n", - " \"\"\"Captures the 
full state of an AnnData object for diffing.\"\"\"\n", - " n_obs: int\n", - " n_vars: int\n", - " obs_cols: list[str] = field(default_factory=list)\n", - " var_cols: list[str] = field(default_factory=list)\n", - " layers: list[str] = field(default_factory=list)\n", - " obsm: list[str] = field(default_factory=list)\n", - " varm: list[str] = field(default_factory=list)\n", - " obsp: list[str] = field(default_factory=list)\n", - " varp: list[str] = field(default_factory=list)\n", - "\n", - " @classmethod\n", - " def from_anndata(cls, adata: ad.AnnData) -> \"AnnDataSnapshot\":\n", - " return cls(\n", - " n_obs=adata.n_obs,\n", - " n_vars=adata.n_vars,\n", - " obs_cols=list(adata.obs.columns),\n", - " var_cols=list(adata.var.columns),\n", - " layers=list(adata.layers.keys()),\n", - " obsm=list(adata.obsm.keys()),\n", - " varm=list(adata.varm.keys()),\n", - " obsp=list(adata.obsp.keys()),\n", - " varp=list(adata.varp.keys()),\n", - " )\n", - "\n", - "\n", - "def _diff_keys(\n", - " pre: list[str],\n", - " post: list[str],\n", - " label: str,\n", - " operation: str,\n", - " ts: str,\n", - ") -> list[str]:\n", - " msgs = []\n", - " for k in sorted(set(pre) - set(post)):\n", - " msgs.append(_format_log_message(operation, f\"{label} removed: '{k}'\", ts))\n", - " for k in sorted(set(post) - set(pre)):\n", - " msgs.append(_format_log_message(operation, f\"{label} added: '{k}'\", ts))\n", - " return msgs\n", - "\n", - "\n", - "def _subset_messages(\n", - " pre: AnnDataSnapshot,\n", - " post: AnnDataSnapshot,\n", - " operation: str = \"subset\",\n", - ") -> list[str]:\n", - " msgs: list[str] = []\n", - " ts = _timestamp()\n", - "\n", - " if pre.n_vars != post.n_vars:\n", - " removed = pre.n_vars - post.n_vars\n", - " pct = round((removed / pre.n_vars) * 100) if pre.n_vars else 0\n", - " msgs.append(_format_log_message(\n", - " operation,\n", - " f\"removed {removed} genes ({pct}%), {post.n_vars} genes remaining\",\n", - " ts,\n", - " ))\n", - "\n", - " if pre.n_obs != 
post.n_obs:\n", - " removed = pre.n_obs - post.n_obs\n", - " pct = round((removed / pre.n_obs) * 100) if pre.n_obs else 0\n", - " msgs.append(_format_log_message(\n", - " operation,\n", - " f\"removed {removed} samples ({pct}%), {post.n_obs} samples remaining\",\n", - " ts,\n", - " ))\n", - "\n", - " msgs += _diff_keys(pre.obs_cols, post.obs_cols, \"obs column\", operation, ts)\n", - " msgs += _diff_keys(pre.var_cols, post.var_cols, \"var column\", operation, ts)\n", - " msgs += _diff_keys(pre.layers, post.layers, \"layer\", operation, ts)\n", - " msgs += _diff_keys(pre.obsm, post.obsm, \"obsm\", operation, ts)\n", - " msgs += _diff_keys(pre.varm, post.varm, \"varm\", operation, ts)\n", - " msgs += _diff_keys(pre.obsp, post.obsp, \"obsp\", operation, ts)\n", - " msgs += _diff_keys(pre.varp, post.varp, \"varp\", operation, ts)\n", - "\n", - " return msgs\n", - "\n", - "\n", - "class LoggedAnnDataStandalone(ad.AnnData):\n", - " \"\"\"Standalone subclass strategy with local logging helpers and message style.\"\"\"\n", - "\n", - " def __init__(self, *args: Any, **kwargs: Any):\n", - " super().__init__(*args, **kwargs)\n", - " _ensure_log_container(self)\n", - "\n", - " @classmethod\n", - " def _safe_component_copy(cls, value):\n", - " return value.copy() if hasattr(value, \"copy\") else deepcopy(value)\n", - "\n", - " @classmethod\n", - " def from_anndata(cls, adata: ad.AnnData) -> \"LoggedAnnDataStandalone\":\n", - " if isinstance(adata, cls):\n", - " _ensure_log_container(adata)\n", - " return adata\n", - "\n", - " kwargs: dict[str, Any] = {\n", - " \"X\": cls._safe_component_copy(adata.X) if adata.X is not None else None,\n", - " \"obs\": adata.obs.copy(),\n", - " \"var\": adata.var.copy(),\n", - " \"uns\": deepcopy(dict(adata.uns)),\n", - " \"obsm\": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()},\n", - " \"varm\": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()},\n", - " \"layers\": {k: cls._safe_component_copy(v) for k, v in 
adata.layers.items()},\n", - " \"obsp\": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()},\n", - " \"varp\": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()},\n", - " }\n", - " if adata.raw is not None:\n", - " kwargs[\"raw\"] = {\n", - " \"X\": cls._safe_component_copy(adata.raw.X),\n", - " \"var\": adata.raw.var.copy(),\n", - " \"varm\": {k: cls._safe_component_copy(v) for k, v in adata.raw.varm.items()},\n", - " }\n", - "\n", - " logged = cls(**kwargs)\n", - " _ensure_log_container(logged)\n", - " return logged\n", - "\n", - " # --- proxied properties: each needs a getter AND a setter ---\n", - "\n", - " @property\n", - " def layers(self):\n", - " return _LoggingProxy(super().layers, self, \"layers\")\n", - "\n", - " @layers.setter\n", - " def layers(self, value):\n", - " _parent_set(self, \"layers\", value)\n", - "\n", - " @property\n", - " def obsm(self):\n", - " return _LoggingProxy(super().obsm, self, \"obsm\")\n", - "\n", - " @obsm.setter\n", - " def obsm(self, value):\n", - " _parent_set(self, \"obsm\", value)\n", - "\n", - " @property\n", - " def varm(self):\n", - " return _LoggingProxy(super().varm, self, \"varm\")\n", - "\n", - " @varm.setter\n", - " def varm(self, value):\n", - " _parent_set(self, \"varm\", value)\n", - "\n", - " @property\n", - " def obsp(self):\n", - " return _LoggingProxy(super().obsp, self, \"obsp\")\n", - "\n", - " @obsp.setter\n", - " def obsp(self, value):\n", - " _parent_set(self, \"obsp\", value)\n", - "\n", - " @property\n", - " def varp(self):\n", - " return _LoggingProxy(super().varp, self, \"varp\")\n", - "\n", - " @varp.setter\n", - " def varp(self, value):\n", - " _parent_set(self, \"varp\", value)\n", - "\n", - " @property\n", - " def obs(self):\n", - " return _LoggingProxy(super().obs, self, \"obs\")\n", - "\n", - " @obs.setter\n", - " def obs(self, value):\n", - " _parent_set(self, \"obs\", value)\n", - "\n", - " @property\n", - " def var(self):\n", - " return _LoggingProxy(super().var, 
self, \"var\")\n", - "\n", - " @var.setter\n", - " def var(self, value):\n", - " ad.AnnData.var.fset(self, value)\n", - "\n", - " # --- snapshot & subsetting ---\n", - "\n", - " def _snapshot(self) -> AnnDataSnapshot:\n", - " return AnnDataSnapshot.from_anndata(self)\n", - "\n", - " def __getitem__(self, index):\n", - " pre = self._snapshot()\n", - " result = super().__getitem__(index)\n", - " logged_result = self.from_anndata(result)\n", - " msgs = _subset_messages(pre, logged_result._snapshot(), operation=\"subset\")\n", - " _inherit_and_append(self, logged_result, msgs)\n", - " return logged_result\n", - "\n", - " def _inplace_subset(self, index):\n", - " pre = self._snapshot()\n", - " super()._inplace_subset(index)\n", - " _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation=\"subset\"))\n", - "\n", - " def _operation_log_block(self) -> str:\n", - " logs = self.uns.get(LOG_KEY, [])\n", - " if not logs:\n", - " return \"\"\n", - " return \"\\n\\nOperation log:\\n\" + \"\\n\".join(str(x) for x in logs)\n", - "\n", - " def __repr__(self) -> str:\n", - " return super().__repr__() + self._operation_log_block()\n", - "\n", - " def __str__(self) -> str:\n", - " return self.__repr__()\n", - "\n", - " def operation_log(self) -> list[str]:\n", - " return list(self.uns.get(LOG_KEY, []))\n", - "\n", - "\n", - "def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone:\n", - " return LoggedAnnDataStandalone.from_anndata(adata)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a256216a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.12.10\n", - "AnnData object with n_obs × n_vars = 100 × 2000\n", - " uns: '_omicslog'\n", - "AnnData object with n_obs × n_vars = 100 × 2000\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/tmp/ipykernel_785154/2927559619.py:5: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n", - " print(ad.__version__)\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import anndata as ad\n", - "from scipy.sparse import csr_matrix\n", - "print(ad.__version__)\n", - "\n", - "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n", - "adata = ad.AnnData(counts)\n", - "\n", - "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n", - "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n", - "\n", - "logdata = log_start(adata)\n", - "print(logdata)\n", - "\n", - "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(logdata.n_obs,))\n", - "logdata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n", - "print(logdata)" - ] - }, - { - "cell_type": "markdown", - "id": "8e0e24d1", - "metadata": {}, - "source": [ - "# Fltrating by Cells (.obs)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "44d8de87", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 2000\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata = logdata[logdata.obs.cell_type == \"B\"]\n", - "logdata" - ] - }, - { - "cell_type": "markdown", - "id": "0f4ac517", - "metadata": {}, - "source": [ - "# Filtering by Genes (.var)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "af577b51", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - "\n", - 
"Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n", - "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata = logdata[:,logdata.var_names.str.endswith(\"1\")]\n", - "logdata" - ] - }, - { - "cell_type": "markdown", - "id": "f1a77680", - "metadata": {}, - "source": [ - "# Adding observatons and variables" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "70952826", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - " obsm: 'X_umap'\n", - " varm: 'gene_stuff'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n", - "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining\n", - "[2026-03-27 16:38:57] obsm: 'X_umap' added\n", - "[2026-03-27 16:38:57] varm: 'gene_stuff' added" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata.obsm[\"X_umap\"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))\n", - "logdata.varm[\"gene_stuff\"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))\n", - "logdata" - ] - }, - { - "cell_type": "markdown", - "id": "d9a5c3b4", - "metadata": {}, - "source": [ - "# Adding layers" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "529860ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - " obsm: 'X_umap'\n", - " varm: 'gene_stuff'\n", - " layers: 'log_transformed'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 
'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n", - "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining\n", - "[2026-03-27 16:38:57] obsm: 'X_umap' added\n", - "[2026-03-27 16:38:57] varm: 'gene_stuff' added\n", - "[2026-03-27 16:38:57] layers: 'log_transformed' added" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata.layers[\"log_transformed\"] = np.log1p(logdata.X)\n", - "logdata" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analysis/py/smoke.py b/analysis/py/smoke.py deleted file mode 100644 index eb17e7b..0000000 --- a/analysis/py/smoke.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys -from pathlib import Path - -def main() -> None: - print("Python:", sys.version) - - # Minimal deps: numpy + sklearn (small, common). Add torch as needed. 
- import numpy as np # noqa: F401 - from sklearn.linear_model import LogisticRegression - - X = np.random.randn(100, 5) - y = (X[:, 0] + 0.1 * X[:, 1] > 0).astype(int) - - clf = LogisticRegression(max_iter=200) - clf.fit(X, y) - acc = clf.score(X, y) - - out = Path("results") - out.mkdir(exist_ok=True) - (out / "smoke_metrics.txt").write_text(f"train_acc={acc:.3f}\n") - - print("Python smoke test completed; wrote results/smoke_metrics.txt") - - -if __name__ == "__main__": - main() - diff --git a/analysis/r/smoke.R b/analysis/r/smoke.R deleted file mode 100644 index fdc5fb0..0000000 --- a/analysis/r/smoke.R +++ /dev/null @@ -1,26 +0,0 @@ -# Minimal smoke test for R container -message("R version: ", R.version.string) - -# Ensure renv works -if (!requireNamespace("renv", quietly = TRUE)) { - stop("renv not installed") -} - -# Optional: install a tiny plotting stack in project env if not present -pkgs <- c("ggplot2") -missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] -if (length(missing) > 0) { - message("Installing missing packages: ", paste(missing, collapse = ", ")) - install.packages(missing, repos = "https://cloud.r-project.org") -} - -library(ggplot2) - -df <- data.frame(x = 1:10, y = (1:10)^2) -p <- ggplot(df, aes(x, y)) + geom_point() + ggtitle("Smoke test plot") - -dir.create("results", showWarnings = FALSE) -ggsave("results/smoke_plot.png", p, width = 6, height = 4, dpi = 150) - -message("R smoke test completed; wrote results/smoke_plot.png") - diff --git a/analysis/r/template.qmd b/analysis/r/template.qmd deleted file mode 100644 index e71273b..0000000 --- a/analysis/r/template.qmd +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "" -author: Juan Henao -date: '`r format(Sys.time(), "%d %B, %Y")`' -description: "" -title-block-banner: "black" -quarto: - components: - panel-tabset: - max_items: 10 -format: - html: - embed-resources: true - smooth-scroll: true - anchor-sections: true - number-sections: true - toc: true - toc-location: left 
- code-fold: true - theme: cerulean -editor: visual ---- diff --git a/demo.ipynb b/demo.ipynb new file mode 100644 index 0000000..9cf0481 --- /dev/null +++ b/demo.ipynb @@ -0,0 +1,585 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e19092cb", + "metadata": {}, + "source": [ + "# Omicslog\n", + "\n", + "## Importing packages" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "14411cbf", + "metadata": {}, + "outputs": [], + "source": [ + "from omicslog import log_start\n", + "import numpy as np\n", + "import pandas as pd\n", + "import anndata as ad\n", + "from scipy.sparse import csr_matrix\n" + ] + }, + { + "cell_type": "markdown", + "id": "99bec3f9", + "metadata": {}, + "source": [ + "
\n", + "📝 Note
\n", + "The AnnData object was generated using code from the original AnnData documentation.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c285a9b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 100 × 2000\n", + " uns: '_omicslog'\n", + "AnnData object with n_obs × n_vars = 100 × 2000\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + "\n", + "Operation log:\n", + "[2026-05-20 13:49:41] obs: 'cell_type' added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
\n", + "
" + ], + "text/plain": [ + " Time Operation Message\n", + "0 2026-05-20 13:49:41 obs 'cell_type' added" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n", + "adata = ad.AnnData(counts)\n", + "\n", + "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n", + "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n", + "\n", + "logdata = log_start(adata)\n", + "print(logdata)\n", + "\n", + "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(logdata.n_obs,))\n", + "logdata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "b88c830d", + "metadata": {}, + "source": [ + "## Fltrating by Cells (.obs)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b88b1e3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 27 × 2000\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + "\n", + "Operation log:\n", + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
\n", + "
" + ], + "text/plain": [ + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata = logdata[logdata.obs.cell_type == \"B\"]\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "6bcdb40a", + "metadata": {}, + "source": [ + "## Filtering by Genes (.var)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8f7ed85f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 27 × 200\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + "\n", + "Operation log:\n", + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n", + "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
\n", + "
" + ], + "text/plain": [ + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "2 2026-05-20 13:49:46 subset \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining \n", + "2 removed 1800 genes (90%), 200 genes remaining " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata = logdata[:,logdata.var_names.str.endswith(\"1\")]\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "a679a08e", + "metadata": {}, + "source": [ + "## Adding observatons and variables" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0fb2c8b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 27 × 200\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + " obsm: 'X_umap'\n", + " varm: 'gene_stuff'\n", + "\n", + "Operation log:\n", + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n", + "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n", + "[2026-05-20 13:49:48] obsm: 'X_umap' added\n", + "[2026-05-20 13:49:48] varm: 'gene_stuff' added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
32026-05-20 13:49:48obsm'X_umap' added
42026-05-20 13:49:48varm'gene_stuff' added
\n", + "
" + ], + "text/plain": [ + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "2 2026-05-20 13:49:46 subset \n", + "3 2026-05-20 13:49:48 obsm \n", + "4 2026-05-20 13:49:48 varm \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining \n", + "2 removed 1800 genes (90%), 200 genes remaining \n", + "3 'X_umap' added \n", + "4 'gene_stuff' added " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata.obsm[\"X_umap\"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))\n", + "logdata.varm[\"gene_stuff\"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "2021a54b", + "metadata": {}, + "source": [ + "## Adding layers" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a8c41222", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 27 × 200\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + " obsm: 'X_umap'\n", + " varm: 'gene_stuff'\n", + " layers: 'log_transformed'\n", + "\n", + "Operation log:\n", + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n", + "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n", + "[2026-05-20 13:49:48] obsm: 'X_umap' added\n", + "[2026-05-20 13:49:48] varm: 'gene_stuff' added\n", + "[2026-05-20 13:49:50] layers: 'log_transformed' added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
32026-05-20 13:49:48obsm'X_umap' added
42026-05-20 13:49:48varm'gene_stuff' added
52026-05-20 13:49:50layers'log_transformed' added
\n", + "
" + ], + "text/plain": [ + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "2 2026-05-20 13:49:46 subset \n", + "3 2026-05-20 13:49:48 obsm \n", + "4 2026-05-20 13:49:48 varm \n", + "5 2026-05-20 13:49:50 layers \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining \n", + "2 removed 1800 genes (90%), 200 genes remaining \n", + "3 'X_umap' added \n", + "4 'gene_stuff' added \n", + "5 'log_transformed' added " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata.layers[\"log_transformed\"] = np.log1p(logdata.X)\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/make_readme.sh b/make_readme.sh new file mode 100644 index 0000000..cf971a5 --- /dev/null +++ b/make_readme.sh @@ -0,0 +1 @@ +poetry run jupyter nbconvert demo.ipynb --to markdown --output README \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index fdd9235..431a26b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,33 @@ [tool.poetry] -name = "project" +name = "omicslog" version = "0.1.0" -description = "CompBio template: Python (Poetry) + R (renv) in separate devcontainers" -authors = ["Your Name "] +description = "AnnData subclass that automatically logs mutations to .uns['_omicslog']" +authors = ["Stefano Mangiola ", "Juan Henao "] +manteiners = ["Juan Henao "] +license = "MIT" readme = "README.md" -package-mode = false +homepage = 
"https://github.com/tidyomics/omicslog_dev" +repository = "https://github.com/tidyomics/omicslog_dev" +keywords = ["bioinformatics", "single-cell", "anndata", "logging"] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +packages = [{include = "omicslog", from = "src"}] [tool.poetry.dependencies] python = ">=3.11,<3.12" +anndata = ">=0.10" numpy = "^2.0.0" -scikit-learn = "^1.5.0" [tool.poetry.group.dev.dependencies] ruff = "^0.6.0" black = "^24.8.0" ipykernel = "^7.2.0" +pytest = "^8.0.0" +scipy = "^1.14.0" [build-system] requires = ["poetry-core"] @@ -25,4 +38,3 @@ line-length = 100 [tool.black] line-length = 100 - diff --git a/src/omicslog/__init__.py b/src/omicslog/__init__.py new file mode 100644 index 0000000..71c626b --- /dev/null +++ b/src/omicslog/__init__.py @@ -0,0 +1,7 @@ +from omicslog.core import ( + LoggedAnnDataStandalone, + AnnDataSnapshot, + log_start, +) + +__all__ = ["LoggedAnnDataStandalone", "AnnDataSnapshot", "log_start"] diff --git a/src/omicslog/core.py b/src/omicslog/core.py new file mode 100644 index 0000000..40db784 --- /dev/null +++ b/src/omicslog/core.py @@ -0,0 +1,327 @@ +from __future__ import annotations + +from copy import deepcopy +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any + +import anndata as ad +import pandas as pd + +LOG_KEY = "_omicslog" + +def _safe_deepcopy_dict(d: dict) -> dict: + result = {} + for k, v in d.items(): + try: + result[k] = deepcopy(v) + except (TypeError, Exception): + pass + return result + + +def _timestamp() -> str: + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +def _format_log_message(operation: str, message: str, ts: str | None = None) -> list[str]: + stamp = ts or _timestamp() + return [stamp, operation, message] + + +def _ensure_log_container(adata: ad.AnnData) -> pd.DataFrame: + 
current = adata.uns.get(LOG_KEY) + if not isinstance(current, pd.DataFrame): + adata.uns[LOG_KEY] = pd.DataFrame(columns=["Time","Operation","Message"]) + return adata.uns[LOG_KEY] + + +def _append_log_messages(adata: ad.AnnData, messages: list[str] | tuple[str, ...]) -> None: + if not messages: + return + container = _ensure_log_container(adata) + new_rows = pd.DataFrame(messages, columns=["Time", "Operation", "Message"]) + adata.uns[LOG_KEY] = pd.concat([container, new_rows], ignore_index=True) + +def _inherit_and_append( + parent: ad.AnnData, + child: ad.AnnData, + messages: list[str] | tuple[str, ...], +) -> None: + child.uns[LOG_KEY] = _ensure_log_container(parent) + _append_log_messages(child, messages) + +def _parent_set(obj, attr: str, value) -> None: + """Set an attribute via the first parent class that defines it. + Works for both plain properties (.fset) and custom descriptors (.__set__). + """ + for base in type(obj).__mro__[1:]: + if attr in base.__dict__: + base.__dict__[attr].__set__(obj, value) + return + object.__setattr__(obj, attr, value) + +class _LoggingProxy: + """ + Transparent proxy for dict-like AnnData components (layers, obsm, varm, ...). + Intercepts __setitem__ and __delitem__ to log mutations automatically. 
+ """ + + def __init__(self, wrapped, owner: "LoggedAnnDataStandalone", label: str): + object.__setattr__(self, "_w", wrapped) + object.__setattr__(self, "_owner", owner) + object.__setattr__(self, "_label", label) + + def __setitem__(self, key: str, value) -> None: + w = object.__getattribute__(self, "_w") + owner = object.__getattribute__(self, "_owner") + label = object.__getattribute__(self, "_label") + verb = "updated" if key in w else "added" + w[key] = value + _append_log_messages(owner, [_format_log_message(label, f"'{key}' {verb}")]) + + def __delitem__(self, key: str) -> None: + w = object.__getattribute__(self, "_w") + owner = object.__getattribute__(self, "_owner") + label = object.__getattribute__(self, "_label") + del w[key] + _append_log_messages(owner, [_format_log_message(label, f"'{key}' removed")]) + + def __getitem__(self, key): + return object.__getattribute__(self, "_w")[key] + + def __getattr__(self, name): + return getattr(object.__getattribute__(self, "_w"), name) + + def __contains__(self, key): + return key in object.__getattribute__(self, "_w") + + def __iter__(self): + return iter(object.__getattribute__(self, "_w")) + + def __len__(self): + return len(object.__getattribute__(self, "_w")) + + def __repr__(self): + return repr(object.__getattribute__(self, "_w")) + + +@dataclass +class AnnDataSnapshot: + """Captures the full state of an AnnData object for diffing.""" + n_obs: int + n_vars: int + obs_cols: list[str] = field(default_factory=list) + var_cols: list[str] = field(default_factory=list) + layers: list[str] = field(default_factory=list) + obsm: list[str] = field(default_factory=list) + varm: list[str] = field(default_factory=list) + obsp: list[str] = field(default_factory=list) + varp: list[str] = field(default_factory=list) + + @classmethod + def from_anndata(cls, adata: ad.AnnData) -> "AnnDataSnapshot": + return cls( + n_obs=adata.n_obs, + n_vars=adata.n_vars, + obs_cols=list(adata.obs.columns), + 
var_cols=list(adata.var.columns), + layers=list(adata.layers.keys()), + obsm=list(adata.obsm.keys()), + varm=list(adata.varm.keys()), + obsp=list(adata.obsp.keys()), + varp=list(adata.varp.keys()), + ) + + +def _diff_keys( + pre: list[str], + post: list[str], + label: str, + operation: str, + ts: str, +) -> list[str]: + msgs = [] + for k in sorted(set(pre) - set(post)): + msgs.append(_format_log_message(operation, f"{label} removed: '{k}'", ts)) + for k in sorted(set(post) - set(pre)): + msgs.append(_format_log_message(operation, f"{label} added: '{k}'", ts)) + return msgs + + +def _subset_messages( + pre: AnnDataSnapshot, + post: AnnDataSnapshot, + operation: str = "subset", +) -> list[str]: + msgs: list[str] = [] + ts = _timestamp() + + if pre.n_vars != post.n_vars: + removed = pre.n_vars - post.n_vars + pct = round((removed / pre.n_vars) * 100) if pre.n_vars else 0 + msgs.append(_format_log_message( + operation, + f"removed {removed} genes ({pct}%), {post.n_vars} genes remaining", + ts, + )) + + if pre.n_obs != post.n_obs: + removed = pre.n_obs - post.n_obs + pct = round((removed / pre.n_obs) * 100) if pre.n_obs else 0 + msgs.append(_format_log_message( + operation, + f"removed {removed} samples ({pct}%), {post.n_obs} samples remaining", + ts, + )) + + msgs += _diff_keys(pre.obs_cols, post.obs_cols, "obs column", operation, ts) + msgs += _diff_keys(pre.var_cols, post.var_cols, "var column", operation, ts) + msgs += _diff_keys(pre.layers, post.layers, "layer", operation, ts) + msgs += _diff_keys(pre.obsm, post.obsm, "obsm", operation, ts) + msgs += _diff_keys(pre.varm, post.varm, "varm", operation, ts) + msgs += _diff_keys(pre.obsp, post.obsp, "obsp", operation, ts) + msgs += _diff_keys(pre.varp, post.varp, "varp", operation, ts) + + return msgs + + +class LoggedAnnDataStandalone(ad.AnnData): + """Standalone subclass strategy with local logging helpers and message style.""" + + def __init__(self, *args: Any, **kwargs: Any): + super().__init__(*args, **kwargs) + 
_ensure_log_container(self) + + @classmethod + def _safe_component_copy(cls, value): + return value.copy() if hasattr(value, "copy") else deepcopy(value) + + @classmethod + def from_anndata(cls, adata: ad.AnnData) -> "LoggedAnnDataStandalone": + if isinstance(adata, cls): + _ensure_log_container(adata) + return adata + + kwargs: dict[str, Any] = { + "X": cls._safe_component_copy(adata.X) if adata.X is not None else None, + "obs": adata.obs.copy(), + "var": adata.var.copy(), + "uns": _safe_deepcopy_dict(dict(adata.uns)), + "obsm": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()}, + "varm": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()}, + "layers": {k: cls._safe_component_copy(v) for k, v in adata.layers.items()}, + "obsp": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()}, + "varp": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()}, + } + + if adata.raw is not None: + kwargs["raw"] = { + "X": cls._safe_component_copy(adata.raw.X), + "var": adata.raw.var.copy(), + "varm": {k: cls._safe_component_copy(v) for k, v in adata.raw.varm.items()}, + } + + logged = cls(**kwargs) + _ensure_log_container(logged) + return logged + + # --- proxied properties: each needs a getter AND a setter --- + + @property + def layers(self): + return _LoggingProxy(super().layers, self, "layers") + + @layers.setter + def layers(self, value): + _parent_set(self, "layers", value) + + @property + def obsm(self): + return _LoggingProxy(super().obsm, self, "obsm") + + @obsm.setter + def obsm(self, value): + _parent_set(self, "obsm", value) + + @property + def varm(self): + return _LoggingProxy(super().varm, self, "varm") + + @varm.setter + def varm(self, value): + _parent_set(self, "varm", value) + + @property + def obsp(self): + return _LoggingProxy(super().obsp, self, "obsp") + + @obsp.setter + def obsp(self, value): + _parent_set(self, "obsp", value) + + @property + def varp(self): + return _LoggingProxy(super().varp, self, "varp") + + 
@varp.setter
+    def varp(self, value):
+        """Replace ``varp`` via the parent class; the replacement itself is not logged."""
+        _parent_set(self, "varp", value)
+
+    @property
+    def obs(self):
+        """Return ``obs`` wrapped in a proxy that logs item assignment and deletion."""
+        return _LoggingProxy(super().obs, self, "obs")
+
+    @obs.setter
+    def obs(self, value):
+        """Replace ``obs`` via the parent class; the replacement itself is not logged."""
+        _parent_set(self, "obs", value)
+
+    @property
+    def var(self):
+        """Return ``var`` wrapped in a proxy that logs item assignment and deletion."""
+        return _LoggingProxy(super().var, self, "var")
+
+    @var.setter
+    def var(self, value):
+        """Replace ``var`` via the parent class; the replacement itself is not logged."""
+        # Go through the shared helper like every other proxied setter instead of
+        # reaching for ``ad.AnnData.var.fset`` directly.
+        _parent_set(self, "var", value)
+
+    # --- snapshot & subsetting ---
+
+    def _snapshot(self) -> AnnDataSnapshot:
+        """Capture the current dimensions and component keys for before/after diffing."""
+        return AnnDataSnapshot.from_anndata(self)
+
+    def __getitem__(self, index):
+        """Subset the data and return a logged object that carries this object's history.
+
+        The parent's result is converted with ``from_anndata``. The differences
+        between the snapshots taken before and after subsetting are appended to
+        a log inherited from ``self``; existing entries of ``self`` are not modified.
+        """
+        pre = self._snapshot()
+        result = super().__getitem__(index)
+        logged_result = self.from_anndata(result)
+        msgs = _subset_messages(pre, logged_result._snapshot(), operation="subset")
+        _inherit_and_append(self, logged_result, msgs)
+        return logged_result
+
+    def _inplace_subset(self, index):
+        """Subset in place via the parent class and log the resulting changes.
+
+        NOTE(review): AnnData's in-place hooks are, as far as I recall, named
+        ``_inplace_subset_obs`` / ``_inplace_subset_var``. Confirm that the parent
+        class defines ``_inplace_subset``; otherwise this override is never invoked
+        by AnnData and the ``super()`` call below raises ``AttributeError``.
+        """
+        pre = self._snapshot()
+        super()._inplace_subset(index)
+        _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation="subset"))
+
+    def _log_lines(self) -> list[str]:
+        """Return the log as a list of ``[time] operation: message`` strings.
+
+        A non-DataFrame container is rendered entry by entry with ``str``.
+        """
+        logs = self.uns.get(LOG_KEY, [])
+        if isinstance(logs, pd.DataFrame):
+            return [
+                f"[{ts}] {op}: {msg}"
+                for ts, op, msg in zip(logs["Time"], logs["Operation"], logs["Message"])
+            ]
+        return [str(entry) for entry in logs] if logs else []
+
+    def _operation_log_block(self) -> str:
+        """Return the text appended to ``repr``: empty when nothing has been logged."""
+        lines = self._log_lines()
+        if not lines:
+            return ""
+        return "\n\nOperation log:\n" + "\n".join(lines)
+
+    def __repr__(self) -> str:
+        """Return the parent ``repr`` followed by the operation log, if any."""
+        return super().__repr__() + self._operation_log_block()
+
+    def __str__(self) -> str:
+        """Return the same text as ``repr`` so ``print`` shows the operation log."""
+        return self.__repr__()
+
+    def operation_log(self) -> list[str]:
+        """Return the logged operations, one formatted string per entry.
+
+        Iterating the log DataFrame directly would yield its column labels
+        (``Time``, ``Operation``, ``Message``) rather than the entries, so the
+        rows are formatted explicitly.
+        """
+        return self._log_lines()
+
+
+def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone:
+    """Start logging for ``adata``.
+
+    Returns ``adata`` itself when it already is a ``LoggedAnnDataStandalone``;
+    otherwise returns a new logged object built from copies of its components.
+    """
+    return LoggedAnnDataStandalone.from_anndata(adata)
diff --git a/renv.lock b/tests/__init__.py
similarity index 100%
rename from renv.lock
rename to tests/__init__.py