From d8fe45cb065e6040db3c5ac9463662dbbb5ec919 Mon Sep 17 00:00:00 2001 From: jdhenaos Date: Tue, 19 May 2026 16:47:48 +0000 Subject: [PATCH 1/3] pypi folder format --- .devcontainer/docker-compose.yml | 29 -- .devcontainer/r/Dockerfile | 48 --- .devcontainer/r/devcontainer.json | 26 -- .gitignore | 13 +- analysis/py/adata_reference_code.ipynb | 243 ----------- analysis/py/omicslog_beta.ipynb | 553 ------------------------- analysis/py/smoke.py | 27 -- analysis/r/smoke.R | 26 -- analysis/r/template.qmd | 22 - pyproject.toml | 21 +- src/omicslog/__init__.py | 7 + src/omicslog/core.py | 313 ++++++++++++++ renv.lock => tests/__init__.py | 0 13 files changed, 338 insertions(+), 990 deletions(-) delete mode 100644 .devcontainer/docker-compose.yml delete mode 100644 .devcontainer/r/Dockerfile delete mode 100644 .devcontainer/r/devcontainer.json delete mode 100644 analysis/py/adata_reference_code.ipynb delete mode 100644 analysis/py/omicslog_beta.ipynb delete mode 100644 analysis/py/smoke.py delete mode 100644 analysis/r/smoke.R delete mode 100644 analysis/r/template.qmd create mode 100644 src/omicslog/__init__.py create mode 100644 src/omicslog/core.py rename renv.lock => tests/__init__.py (100%) diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml deleted file mode 100644 index 76f6ad3..0000000 --- a/.devcontainer/docker-compose.yml +++ /dev/null @@ -1,29 +0,0 @@ -version: "3.9" - -services: - r-dev: - build: - context: .. - dockerfile: .devcontainer/r/Dockerfile - volumes: - - ..:/workspaces/project:cached - - renv_cache:/opt/renv/cache - - ${SECURE_DATA_DIR:-}:/secure-data:ro - working_dir: /workspaces/project - command: sleep infinity - - py-dev: - build: - context: .. 
- dockerfile: .devcontainer/py/Dockerfile - volumes: - - ..:/workspaces/project:cached - - ${SECURE_DATA_DIR:-}:/secure-data:ro - working_dir: /workspaces/project - command: sleep infinity - -volumes: - renv_cache: - poetry_cache: - pip_cache: - diff --git a/.devcontainer/r/Dockerfile b/.devcontainer/r/Dockerfile deleted file mode 100644 index 6c2368b..0000000 --- a/.devcontainer/r/Dockerfile +++ /dev/null @@ -1,48 +0,0 @@ -# Pin R version here (adjust as needed) -FROM rocker/r-ver:4.4.2 - -ENV DEBIAN_FRONTEND=noninteractive - -# System deps commonly needed for Bioconductor + ggplot + compilation -RUN apt-get update && apt-get install -y --no-install-recommends \ - git \ - curl \ - wget \ - ca-certificates \ - locales \ - build-essential \ - pkg-config \ - libcurl4-openssl-dev \ - libssl-dev \ - libxml2-dev \ - libfontconfig1-dev \ - libfreetype6-dev \ - libpng-dev \ - libtiff5-dev \ - libjpeg-dev \ - libcairo2-dev \ - libharfbuzz-dev \ - libfribidi-dev \ - libglpk-dev \ - && rm -rf /var/lib/apt/lists/* - -# Locale (helps with some R packages / rendering) -RUN sed -i 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ - locale-gen -ENV LANG=en_US.UTF-8 -ENV LC_ALL=en_US.UTF-8 - -# Install renv in the system library (project will still use renv.lock) -RUN R -q -e "install.packages('renv', repos='https://cloud.r-project.org')" - -# --- Quarto (pin version for reproducible rendering) --- -ENV QUARTO_VERSION=1.5.57 -RUN wget -q "https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.deb" \ - && dpkg -i "quarto-${QUARTO_VERSION}-linux-amd64.deb" \ - && rm -f "quarto-${QUARTO_VERSION}-linux-amd64.deb" - -# Optional: speed up / standardize renv cache location (persisted via volume) -ENV RENV_PATHS_CACHE=/opt/renv/cache - -WORKDIR /workspaces/project - diff --git a/.devcontainer/r/devcontainer.json b/.devcontainer/r/devcontainer.json deleted file mode 100644 index d28daad..0000000 --- 
a/.devcontainer/r/devcontainer.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "name": "project (R + renv)", - "dockerComposeFile": ["../docker-compose.yml"], - "service": "r-dev", - "workspaceFolder": "/workspaces/project", - "shutdownAction": "stopCompose", - - "customizations": { - "vscode": { - "extensions": [ - "REditorSupport.r", - "rdebugger.r-debugger", - "Ikuyadeu.r", - "ms-azuretools.vscode-docker", - "GitHub.copilot" - ], - "settings": { - "r.rterm.linux": "/usr/bin/R", - "r.bracketedPaste": true - } - } - }, - - "postCreateCommand": "bash -lc 'quarto --version && if [ -f renv.lock ]; then R -q -e \"renv::restore(prompt = FALSE)\"; else echo \"No renv.lock yet. Run: R -q -e \\\"renv::init()\\\"\"; fi'" -} - diff --git a/.gitignore b/.gitignore index b2f88fe..0d024df 100644 --- a/.gitignore +++ b/.gitignore @@ -8,11 +8,8 @@ __pycache__/ .pytest_cache/ .mypy_cache/ .ruff_cache/ - -# R -.Rhistory -.RData -.Rproj.user/ +dist/ +*.egg-info/ # Outputs results/ @@ -22,12 +19,6 @@ data/**/processed/ # OS .DS_Store -# Quarto -.quarto/ -_quarto/ -_site/ -_freeze/ - .devcontainer/.env .env config/local.env diff --git a/analysis/py/adata_reference_code.ipynb b/analysis/py/adata_reference_code.ipynb deleted file mode 100644 index cc84126..0000000 --- a/analysis/py/adata_reference_code.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8fb259db", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "dbb1493c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.12.10\n", - "AnnData object with n_obs × n_vars = 100 × 2000\n", - "Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',\n", - " 'Cell_7', 'Cell_8', 'Cell_9'],\n", - " dtype='object')\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_45018/475607713.py:5: FutureWarning: `__version__` is deprecated, use 
`importlib.metadata.version('anndata')` instead.\n", - " print(ad.__version__)\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import anndata as ad\n", - "from scipy.sparse import csr_matrix\n", - "print(ad.__version__)\n", - "\n", - "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n", - "adata = ad.AnnData(counts)\n", - "print(adata)\n", - "\n", - "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n", - "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n", - "print(adata.obs_names[:10])" - ] - }, - { - "cell_type": "markdown", - "id": "beed1340", - "metadata": {}, - "source": [ - "# Filtering by Cells" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f0969e08", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "View of AnnData object with n_obs × n_vars = 2 × 2000" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "adata = adata[[\"Cell_1\", \"Cell_10\"], ]\n", - "adata" - ] - }, - { - "cell_type": "markdown", - "id": "3f9bfc1c", - "metadata": {}, - "source": [ - "# Filtering by cells (.obs)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a0b17993", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_45018/910546738.py:2: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n", - " adata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
cell_type
Cell_1B
Cell_10T
\n", - "
" - ], - "text/plain": [ - " cell_type\n", - "Cell_1 B\n", - "Cell_10 T" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(adata.n_obs,))\n", - "adata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n", - "adata.obs" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "c77a07da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "View of AnnData object with n_obs × n_vars = 2 × 2000\n", - " obs: 'cell_type'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "adata = adata[adata.obs.cell_type == \"B\"]\n", - "adata" - ] - }, - { - "cell_type": "markdown", - "id": "a3c37b5a", - "metadata": {}, - "source": [ - "# Adding layers" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "fab07d95", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_44426/2818935080.py:1: ImplicitModificationWarning: Setting element `.layers['log_transformed']` of view, initializing view as actual.\n", - " adata.layers[\"log_transformed\"] = np.log1p(adata.X)\n" - ] - }, - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 2 × 2000\n", - " obs: 'cell_type'\n", - " layers: 'log_transformed'" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "adata.layers[\"log_transformed\"] = np.log1p(adata.X)\n", - "adata" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.14" - } 
- }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analysis/py/omicslog_beta.ipynb b/analysis/py/omicslog_beta.ipynb deleted file mode 100644 index d194419..0000000 --- a/analysis/py/omicslog_beta.ipynb +++ /dev/null @@ -1,553 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f0b85c1e", - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "from copy import deepcopy\n", - "from dataclasses import dataclass, field\n", - "from datetime import datetime\n", - "from typing import Any\n", - "\n", - "import anndata as ad\n", - "\n", - "LOG_KEY = \"_omicslog\"\n", - "\n", - "\n", - "def _timestamp() -> str:\n", - " return datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", - "\n", - "\n", - "def _format_log_message(operation: str, message: str, ts: str | None = None) -> str:\n", - " stamp = ts or _timestamp()\n", - " return f\"[{stamp}] {operation}: {message}\"\n", - "\n", - "\n", - "def _ensure_log_container(adata: ad.AnnData) -> list[str]:\n", - " current = adata.uns.get(LOG_KEY)\n", - " if not isinstance(current, list):\n", - " adata.uns[LOG_KEY] = []\n", - " return adata.uns[LOG_KEY]\n", - "\n", - "\n", - "def _append_log_messages(adata: ad.AnnData, messages: list[str] | tuple[str, ...]) -> None:\n", - " if not messages:\n", - " return\n", - " _ensure_log_container(adata).extend(messages)\n", - "\n", - "\n", - "def _inherit_and_append(\n", - " parent: ad.AnnData,\n", - " child: ad.AnnData,\n", - " messages: list[str] | tuple[str, ...],\n", - ") -> None:\n", - " child.uns[LOG_KEY] = list(_ensure_log_container(parent))\n", - " _append_log_messages(child, messages)\n", - "\n", - "def _parent_set(obj, attr: str, value) -> None:\n", - " \"\"\"Set an attribute via the first parent class that defines it.\n", - " Works for both plain properties (.fset) and custom descriptors (.__set__).\n", - " \"\"\"\n", - " for base in type(obj).__mro__[1:]:\n", - " if attr in base.__dict__:\n", - " 
base.__dict__[attr].__set__(obj, value)\n", - " return\n", - " object.__setattr__(obj, attr, value)\n", - "\n", - "class _LoggingProxy:\n", - " \"\"\"\n", - " Transparent proxy for dict-like AnnData components (layers, obsm, varm, ...).\n", - " Intercepts __setitem__ and __delitem__ to log mutations automatically.\n", - " \"\"\"\n", - "\n", - " def __init__(self, wrapped, owner: \"LoggedAnnDataStandalone\", label: str):\n", - " object.__setattr__(self, \"_w\", wrapped)\n", - " object.__setattr__(self, \"_owner\", owner)\n", - " object.__setattr__(self, \"_label\", label)\n", - "\n", - " def __setitem__(self, key: str, value) -> None:\n", - " w = object.__getattribute__(self, \"_w\")\n", - " owner = object.__getattribute__(self, \"_owner\")\n", - " label = object.__getattribute__(self, \"_label\")\n", - " verb = \"updated\" if key in w else \"added\"\n", - " w[key] = value\n", - " _append_log_messages(owner, [_format_log_message(label, f\"'{key}' {verb}\")])\n", - "\n", - " def __delitem__(self, key: str) -> None:\n", - " w = object.__getattribute__(self, \"_w\")\n", - " owner = object.__getattribute__(self, \"_owner\")\n", - " label = object.__getattribute__(self, \"_label\")\n", - " del w[key]\n", - " _append_log_messages(owner, [_format_log_message(label, f\"'{key}' removed\")])\n", - "\n", - " def __getitem__(self, key):\n", - " return object.__getattribute__(self, \"_w\")[key]\n", - "\n", - " def __getattr__(self, name):\n", - " return getattr(object.__getattribute__(self, \"_w\"), name)\n", - "\n", - " def __contains__(self, key):\n", - " return key in object.__getattribute__(self, \"_w\")\n", - "\n", - " def __iter__(self):\n", - " return iter(object.__getattribute__(self, \"_w\"))\n", - "\n", - " def __len__(self):\n", - " return len(object.__getattribute__(self, \"_w\"))\n", - "\n", - " def __repr__(self):\n", - " return repr(object.__getattribute__(self, \"_w\"))\n", - "\n", - "\n", - "@dataclass\n", - "class AnnDataSnapshot:\n", - " \"\"\"Captures the 
full state of an AnnData object for diffing.\"\"\"\n", - " n_obs: int\n", - " n_vars: int\n", - " obs_cols: list[str] = field(default_factory=list)\n", - " var_cols: list[str] = field(default_factory=list)\n", - " layers: list[str] = field(default_factory=list)\n", - " obsm: list[str] = field(default_factory=list)\n", - " varm: list[str] = field(default_factory=list)\n", - " obsp: list[str] = field(default_factory=list)\n", - " varp: list[str] = field(default_factory=list)\n", - "\n", - " @classmethod\n", - " def from_anndata(cls, adata: ad.AnnData) -> \"AnnDataSnapshot\":\n", - " return cls(\n", - " n_obs=adata.n_obs,\n", - " n_vars=adata.n_vars,\n", - " obs_cols=list(adata.obs.columns),\n", - " var_cols=list(adata.var.columns),\n", - " layers=list(adata.layers.keys()),\n", - " obsm=list(adata.obsm.keys()),\n", - " varm=list(adata.varm.keys()),\n", - " obsp=list(adata.obsp.keys()),\n", - " varp=list(adata.varp.keys()),\n", - " )\n", - "\n", - "\n", - "def _diff_keys(\n", - " pre: list[str],\n", - " post: list[str],\n", - " label: str,\n", - " operation: str,\n", - " ts: str,\n", - ") -> list[str]:\n", - " msgs = []\n", - " for k in sorted(set(pre) - set(post)):\n", - " msgs.append(_format_log_message(operation, f\"{label} removed: '{k}'\", ts))\n", - " for k in sorted(set(post) - set(pre)):\n", - " msgs.append(_format_log_message(operation, f\"{label} added: '{k}'\", ts))\n", - " return msgs\n", - "\n", - "\n", - "def _subset_messages(\n", - " pre: AnnDataSnapshot,\n", - " post: AnnDataSnapshot,\n", - " operation: str = \"subset\",\n", - ") -> list[str]:\n", - " msgs: list[str] = []\n", - " ts = _timestamp()\n", - "\n", - " if pre.n_vars != post.n_vars:\n", - " removed = pre.n_vars - post.n_vars\n", - " pct = round((removed / pre.n_vars) * 100) if pre.n_vars else 0\n", - " msgs.append(_format_log_message(\n", - " operation,\n", - " f\"removed {removed} genes ({pct}%), {post.n_vars} genes remaining\",\n", - " ts,\n", - " ))\n", - "\n", - " if pre.n_obs != 
post.n_obs:\n", - " removed = pre.n_obs - post.n_obs\n", - " pct = round((removed / pre.n_obs) * 100) if pre.n_obs else 0\n", - " msgs.append(_format_log_message(\n", - " operation,\n", - " f\"removed {removed} samples ({pct}%), {post.n_obs} samples remaining\",\n", - " ts,\n", - " ))\n", - "\n", - " msgs += _diff_keys(pre.obs_cols, post.obs_cols, \"obs column\", operation, ts)\n", - " msgs += _diff_keys(pre.var_cols, post.var_cols, \"var column\", operation, ts)\n", - " msgs += _diff_keys(pre.layers, post.layers, \"layer\", operation, ts)\n", - " msgs += _diff_keys(pre.obsm, post.obsm, \"obsm\", operation, ts)\n", - " msgs += _diff_keys(pre.varm, post.varm, \"varm\", operation, ts)\n", - " msgs += _diff_keys(pre.obsp, post.obsp, \"obsp\", operation, ts)\n", - " msgs += _diff_keys(pre.varp, post.varp, \"varp\", operation, ts)\n", - "\n", - " return msgs\n", - "\n", - "\n", - "class LoggedAnnDataStandalone(ad.AnnData):\n", - " \"\"\"Standalone subclass strategy with local logging helpers and message style.\"\"\"\n", - "\n", - " def __init__(self, *args: Any, **kwargs: Any):\n", - " super().__init__(*args, **kwargs)\n", - " _ensure_log_container(self)\n", - "\n", - " @classmethod\n", - " def _safe_component_copy(cls, value):\n", - " return value.copy() if hasattr(value, \"copy\") else deepcopy(value)\n", - "\n", - " @classmethod\n", - " def from_anndata(cls, adata: ad.AnnData) -> \"LoggedAnnDataStandalone\":\n", - " if isinstance(adata, cls):\n", - " _ensure_log_container(adata)\n", - " return adata\n", - "\n", - " kwargs: dict[str, Any] = {\n", - " \"X\": cls._safe_component_copy(adata.X) if adata.X is not None else None,\n", - " \"obs\": adata.obs.copy(),\n", - " \"var\": adata.var.copy(),\n", - " \"uns\": deepcopy(dict(adata.uns)),\n", - " \"obsm\": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()},\n", - " \"varm\": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()},\n", - " \"layers\": {k: cls._safe_component_copy(v) for k, v in 
adata.layers.items()},\n", - " \"obsp\": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()},\n", - " \"varp\": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()},\n", - " }\n", - " if adata.raw is not None:\n", - " kwargs[\"raw\"] = {\n", - " \"X\": cls._safe_component_copy(adata.raw.X),\n", - " \"var\": adata.raw.var.copy(),\n", - " \"varm\": {k: cls._safe_component_copy(v) for k, v in adata.raw.varm.items()},\n", - " }\n", - "\n", - " logged = cls(**kwargs)\n", - " _ensure_log_container(logged)\n", - " return logged\n", - "\n", - " # --- proxied properties: each needs a getter AND a setter ---\n", - "\n", - " @property\n", - " def layers(self):\n", - " return _LoggingProxy(super().layers, self, \"layers\")\n", - "\n", - " @layers.setter\n", - " def layers(self, value):\n", - " _parent_set(self, \"layers\", value)\n", - "\n", - " @property\n", - " def obsm(self):\n", - " return _LoggingProxy(super().obsm, self, \"obsm\")\n", - "\n", - " @obsm.setter\n", - " def obsm(self, value):\n", - " _parent_set(self, \"obsm\", value)\n", - "\n", - " @property\n", - " def varm(self):\n", - " return _LoggingProxy(super().varm, self, \"varm\")\n", - "\n", - " @varm.setter\n", - " def varm(self, value):\n", - " _parent_set(self, \"varm\", value)\n", - "\n", - " @property\n", - " def obsp(self):\n", - " return _LoggingProxy(super().obsp, self, \"obsp\")\n", - "\n", - " @obsp.setter\n", - " def obsp(self, value):\n", - " _parent_set(self, \"obsp\", value)\n", - "\n", - " @property\n", - " def varp(self):\n", - " return _LoggingProxy(super().varp, self, \"varp\")\n", - "\n", - " @varp.setter\n", - " def varp(self, value):\n", - " _parent_set(self, \"varp\", value)\n", - "\n", - " @property\n", - " def obs(self):\n", - " return _LoggingProxy(super().obs, self, \"obs\")\n", - "\n", - " @obs.setter\n", - " def obs(self, value):\n", - " _parent_set(self, \"obs\", value)\n", - "\n", - " @property\n", - " def var(self):\n", - " return _LoggingProxy(super().var, 
self, \"var\")\n", - "\n", - " @var.setter\n", - " def var(self, value):\n", - " ad.AnnData.var.fset(self, value)\n", - "\n", - " # --- snapshot & subsetting ---\n", - "\n", - " def _snapshot(self) -> AnnDataSnapshot:\n", - " return AnnDataSnapshot.from_anndata(self)\n", - "\n", - " def __getitem__(self, index):\n", - " pre = self._snapshot()\n", - " result = super().__getitem__(index)\n", - " logged_result = self.from_anndata(result)\n", - " msgs = _subset_messages(pre, logged_result._snapshot(), operation=\"subset\")\n", - " _inherit_and_append(self, logged_result, msgs)\n", - " return logged_result\n", - "\n", - " def _inplace_subset(self, index):\n", - " pre = self._snapshot()\n", - " super()._inplace_subset(index)\n", - " _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation=\"subset\"))\n", - "\n", - " def _operation_log_block(self) -> str:\n", - " logs = self.uns.get(LOG_KEY, [])\n", - " if not logs:\n", - " return \"\"\n", - " return \"\\n\\nOperation log:\\n\" + \"\\n\".join(str(x) for x in logs)\n", - "\n", - " def __repr__(self) -> str:\n", - " return super().__repr__() + self._operation_log_block()\n", - "\n", - " def __str__(self) -> str:\n", - " return self.__repr__()\n", - "\n", - " def operation_log(self) -> list[str]:\n", - " return list(self.uns.get(LOG_KEY, []))\n", - "\n", - "\n", - "def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone:\n", - " return LoggedAnnDataStandalone.from_anndata(adata)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a256216a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.12.10\n", - "AnnData object with n_obs × n_vars = 100 × 2000\n", - " uns: '_omicslog'\n", - "AnnData object with n_obs × n_vars = 100 × 2000\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/tmp/ipykernel_785154/2927559619.py:5: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n", - " print(ad.__version__)\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import anndata as ad\n", - "from scipy.sparse import csr_matrix\n", - "print(ad.__version__)\n", - "\n", - "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n", - "adata = ad.AnnData(counts)\n", - "\n", - "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n", - "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n", - "\n", - "logdata = log_start(adata)\n", - "print(logdata)\n", - "\n", - "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(logdata.n_obs,))\n", - "logdata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n", - "print(logdata)" - ] - }, - { - "cell_type": "markdown", - "id": "8e0e24d1", - "metadata": {}, - "source": [ - "# Fltrating by Cells (.obs)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "44d8de87", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 2000\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata = logdata[logdata.obs.cell_type == \"B\"]\n", - "logdata" - ] - }, - { - "cell_type": "markdown", - "id": "0f4ac517", - "metadata": {}, - "source": [ - "# Filtering by Genes (.var)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "af577b51", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - "\n", - 
"Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n", - "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata = logdata[:,logdata.var_names.str.endswith(\"1\")]\n", - "logdata" - ] - }, - { - "cell_type": "markdown", - "id": "f1a77680", - "metadata": {}, - "source": [ - "# Adding observatons and variables" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "70952826", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - " obsm: 'X_umap'\n", - " varm: 'gene_stuff'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n", - "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining\n", - "[2026-03-27 16:38:57] obsm: 'X_umap' added\n", - "[2026-03-27 16:38:57] varm: 'gene_stuff' added" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata.obsm[\"X_umap\"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))\n", - "logdata.varm[\"gene_stuff\"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))\n", - "logdata" - ] - }, - { - "cell_type": "markdown", - "id": "d9a5c3b4", - "metadata": {}, - "source": [ - "# Adding layers" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "529860ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", - " obs: 'cell_type'\n", - " uns: '_omicslog'\n", - " obsm: 'X_umap'\n", - " varm: 'gene_stuff'\n", - " layers: 'log_transformed'\n", - "\n", - "Operation log:\n", - "[2026-03-27 16:38:57] obs: 
'cell_type' added\n", - "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n", - "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining\n", - "[2026-03-27 16:38:57] obsm: 'X_umap' added\n", - "[2026-03-27 16:38:57] varm: 'gene_stuff' added\n", - "[2026-03-27 16:38:57] layers: 'log_transformed' added" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "logdata.layers[\"log_transformed\"] = np.log1p(logdata.X)\n", - "logdata" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analysis/py/smoke.py b/analysis/py/smoke.py deleted file mode 100644 index eb17e7b..0000000 --- a/analysis/py/smoke.py +++ /dev/null @@ -1,27 +0,0 @@ -import sys -from pathlib import Path - -def main() -> None: - print("Python:", sys.version) - - # Minimal deps: numpy + sklearn (small, common). Add torch as needed. 
- import numpy as np # noqa: F401 - from sklearn.linear_model import LogisticRegression - - X = np.random.randn(100, 5) - y = (X[:, 0] + 0.1 * X[:, 1] > 0).astype(int) - - clf = LogisticRegression(max_iter=200) - clf.fit(X, y) - acc = clf.score(X, y) - - out = Path("results") - out.mkdir(exist_ok=True) - (out / "smoke_metrics.txt").write_text(f"train_acc={acc:.3f}\n") - - print("Python smoke test completed; wrote results/smoke_metrics.txt") - - -if __name__ == "__main__": - main() - diff --git a/analysis/r/smoke.R b/analysis/r/smoke.R deleted file mode 100644 index fdc5fb0..0000000 --- a/analysis/r/smoke.R +++ /dev/null @@ -1,26 +0,0 @@ -# Minimal smoke test for R container -message("R version: ", R.version.string) - -# Ensure renv works -if (!requireNamespace("renv", quietly = TRUE)) { - stop("renv not installed") -} - -# Optional: install a tiny plotting stack in project env if not present -pkgs <- c("ggplot2") -missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] -if (length(missing) > 0) { - message("Installing missing packages: ", paste(missing, collapse = ", ")) - install.packages(missing, repos = "https://cloud.r-project.org") -} - -library(ggplot2) - -df <- data.frame(x = 1:10, y = (1:10)^2) -p <- ggplot(df, aes(x, y)) + geom_point() + ggtitle("Smoke test plot") - -dir.create("results", showWarnings = FALSE) -ggsave("results/smoke_plot.png", p, width = 6, height = 4, dpi = 150) - -message("R smoke test completed; wrote results/smoke_plot.png") - diff --git a/analysis/r/template.qmd b/analysis/r/template.qmd deleted file mode 100644 index e71273b..0000000 --- a/analysis/r/template.qmd +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "" -author: Juan Henao -date: '`r format(Sys.time(), "%d %B, %Y")`' -description: "" -title-block-banner: "black" -quarto: - components: - panel-tabset: - max_items: 10 -format: - html: - embed-resources: true - smooth-scroll: true - anchor-sections: true - number-sections: true - toc: true - toc-location: left 
- code-fold: true - theme: cerulean -editor: visual ---- diff --git a/pyproject.toml b/pyproject.toml index fdd9235..cf82774 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,32 @@ [tool.poetry] -name = "project" +name = "omicslog" version = "0.1.0" -description = "CompBio template: Python (Poetry) + R (renv) in separate devcontainers" +description = "AnnData subclass that automatically logs mutations to .uns['_omicslog']" authors = ["Your Name "] +license = "MIT" readme = "README.md" -package-mode = false +homepage = "https://github.com/your-org/omicslog" +repository = "https://github.com/your-org/omicslog" +keywords = ["bioinformatics", "single-cell", "anndata", "logging"] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +packages = [{include = "omicslog", from = "src"}] [tool.poetry.dependencies] python = ">=3.11,<3.12" +anndata = ">=0.10" numpy = "^2.0.0" -scikit-learn = "^1.5.0" [tool.poetry.group.dev.dependencies] ruff = "^0.6.0" black = "^24.8.0" ipykernel = "^7.2.0" +pytest = "^8.0.0" +scipy = "^1.14.0" [build-system] requires = ["poetry-core"] @@ -25,4 +37,3 @@ line-length = 100 [tool.black] line-length = 100 - diff --git a/src/omicslog/__init__.py b/src/omicslog/__init__.py new file mode 100644 index 0000000..71c626b --- /dev/null +++ b/src/omicslog/__init__.py @@ -0,0 +1,7 @@ +from omicslog.core import ( + LoggedAnnDataStandalone, + AnnDataSnapshot, + log_start, +) + +__all__ = ["LoggedAnnDataStandalone", "AnnDataSnapshot", "log_start"] diff --git a/src/omicslog/core.py b/src/omicslog/core.py new file mode 100644 index 0000000..8cd4872 --- /dev/null +++ b/src/omicslog/core.py @@ -0,0 +1,313 @@ +from __future__ import annotations + +from copy import deepcopy +from dataclasses import dataclass, field +from datetime import datetime +from typing import Any + +import anndata as ad + +LOG_KEY = 
"_omicslog" + + +def _timestamp() -> str: + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +def _format_log_message(operation: str, message: str, ts: str | None = None) -> str: + stamp = ts or _timestamp() + return f"[{stamp}] {operation}: {message}" + + +def _ensure_log_container(adata: ad.AnnData) -> list[str]: + current = adata.uns.get(LOG_KEY) + if not isinstance(current, list): + adata.uns[LOG_KEY] = [] + return adata.uns[LOG_KEY] + + +def _append_log_messages(adata: ad.AnnData, messages: list[str] | tuple[str, ...]) -> None: + if not messages: + return + _ensure_log_container(adata).extend(messages) + + +def _inherit_and_append( + parent: ad.AnnData, + child: ad.AnnData, + messages: list[str] | tuple[str, ...], +) -> None: + child.uns[LOG_KEY] = list(_ensure_log_container(parent)) + _append_log_messages(child, messages) + + +def _parent_set(obj: Any, attr: str, value: Any) -> None: + """Set an attribute via the first parent class that defines it. + Works for both plain properties (.fset) and custom descriptors (.__set__). + """ + for base in type(obj).__mro__[1:]: + if attr in base.__dict__: + base.__dict__[attr].__set__(obj, value) + return + object.__setattr__(obj, attr, value) + + +class _LoggingProxy: + """Transparent proxy for dict-like AnnData components. + Intercepts __setitem__ and __delitem__ to log mutations automatically. 
+ """ + + def __init__(self, wrapped: Any, owner: "LoggedAnnDataStandalone", label: str): + object.__setattr__(self, "_w", wrapped) + object.__setattr__(self, "_owner", owner) + object.__setattr__(self, "_label", label) + + def __setitem__(self, key: str, value: Any) -> None: + w = object.__getattribute__(self, "_w") + owner = object.__getattribute__(self, "_owner") + label = object.__getattribute__(self, "_label") + verb = "updated" if key in w else "added" + w[key] = value + _append_log_messages(owner, [_format_log_message(label, f"'{key}' {verb}")]) + + def __delitem__(self, key: str) -> None: + w = object.__getattribute__(self, "_w") + owner = object.__getattribute__(self, "_owner") + label = object.__getattribute__(self, "_label") + del w[key] + _append_log_messages(owner, [_format_log_message(label, f"'{key}' removed")]) + + def __getitem__(self, key: Any) -> Any: + return object.__getattribute__(self, "_w")[key] + + def __getattr__(self, name: str) -> Any: + return getattr(object.__getattribute__(self, "_w"), name) + + def __contains__(self, key: Any) -> bool: + return key in object.__getattribute__(self, "_w") + + def __iter__(self): + return iter(object.__getattribute__(self, "_w")) + + def __len__(self) -> int: + return len(object.__getattribute__(self, "_w")) + + def __repr__(self) -> str: + return repr(object.__getattribute__(self, "_w")) + + +@dataclass +class AnnDataSnapshot: + """Captures the key state of an AnnData object for diffing.""" + + n_obs: int + n_vars: int + obs_cols: list[str] = field(default_factory=list) + var_cols: list[str] = field(default_factory=list) + layers: list[str] = field(default_factory=list) + obsm: list[str] = field(default_factory=list) + varm: list[str] = field(default_factory=list) + obsp: list[str] = field(default_factory=list) + varp: list[str] = field(default_factory=list) + + @classmethod + def from_anndata(cls, adata: ad.AnnData) -> "AnnDataSnapshot": + return cls( + n_obs=adata.n_obs, + n_vars=adata.n_vars, + 
obs_cols=list(adata.obs.columns), + var_cols=list(adata.var.columns), + layers=list(adata.layers.keys()), + obsm=list(adata.obsm.keys()), + varm=list(adata.varm.keys()), + obsp=list(adata.obsp.keys()), + varp=list(adata.varp.keys()), + ) + + +def _diff_keys( + pre: list[str], + post: list[str], + label: str, + operation: str, + ts: str, +) -> list[str]: + msgs = [] + for k in sorted(set(pre) - set(post)): + msgs.append(_format_log_message(operation, f"{label} removed: '{k}'", ts)) + for k in sorted(set(post) - set(pre)): + msgs.append(_format_log_message(operation, f"{label} added: '{k}'", ts)) + return msgs + + +def _subset_messages( + pre: AnnDataSnapshot, + post: AnnDataSnapshot, + operation: str = "subset", +) -> list[str]: + msgs: list[str] = [] + ts = _timestamp() + + if pre.n_vars != post.n_vars: + removed = pre.n_vars - post.n_vars + pct = round((removed / pre.n_vars) * 100) if pre.n_vars else 0 + msgs.append(_format_log_message( + operation, + f"removed {removed} genes ({pct}%), {post.n_vars} genes remaining", + ts, + )) + + if pre.n_obs != post.n_obs: + removed = pre.n_obs - post.n_obs + pct = round((removed / pre.n_obs) * 100) if pre.n_obs else 0 + msgs.append(_format_log_message( + operation, + f"removed {removed} samples ({pct}%), {post.n_obs} samples remaining", + ts, + )) + + msgs += _diff_keys(pre.obs_cols, post.obs_cols, "obs column", operation, ts) + msgs += _diff_keys(pre.var_cols, post.var_cols, "var column", operation, ts) + msgs += _diff_keys(pre.layers, post.layers, "layer", operation, ts) + msgs += _diff_keys(pre.obsm, post.obsm, "obsm", operation, ts) + msgs += _diff_keys(pre.varm, post.varm, "varm", operation, ts) + msgs += _diff_keys(pre.obsp, post.obsp, "obsp", operation, ts) + msgs += _diff_keys(pre.varp, post.varp, "varp", operation, ts) + + return msgs + + +class LoggedAnnDataStandalone(ad.AnnData): + """AnnData subclass that automatically logs mutations to .uns['_omicslog'].""" + + def __init__(self, *args: Any, **kwargs: Any): + 
super().__init__(*args, **kwargs) + _ensure_log_container(self) + + @classmethod + def _safe_component_copy(cls, value: Any) -> Any: + return value.copy() if hasattr(value, "copy") else deepcopy(value) + + @classmethod + def from_anndata(cls, adata: ad.AnnData) -> "LoggedAnnDataStandalone": + if isinstance(adata, cls): + _ensure_log_container(adata) + return adata + + kwargs: dict[str, Any] = { + "X": cls._safe_component_copy(adata.X) if adata.X is not None else None, + "obs": adata.obs.copy(), + "var": adata.var.copy(), + "uns": deepcopy(dict(adata.uns)), + "obsm": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()}, + "varm": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()}, + "layers": {k: cls._safe_component_copy(v) for k, v in adata.layers.items()}, + "obsp": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()}, + "varp": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()}, + } + if adata.raw is not None: + kwargs["raw"] = { + "X": cls._safe_component_copy(adata.raw.X), + "var": adata.raw.var.copy(), + "varm": {k: cls._safe_component_copy(v) for k, v in adata.raw.varm.items()}, + } + + logged = cls(**kwargs) + _ensure_log_container(logged) + return logged + + # --- proxied properties: intercept direct dict-style mutations --- + + @property + def layers(self): + return _LoggingProxy(super().layers, self, "layers") + + @layers.setter + def layers(self, value: Any) -> None: + _parent_set(self, "layers", value) + + @property + def obsm(self): + return _LoggingProxy(super().obsm, self, "obsm") + + @obsm.setter + def obsm(self, value: Any) -> None: + _parent_set(self, "obsm", value) + + @property + def varm(self): + return _LoggingProxy(super().varm, self, "varm") + + @varm.setter + def varm(self, value: Any) -> None: + _parent_set(self, "varm", value) + + @property + def obsp(self): + return _LoggingProxy(super().obsp, self, "obsp") + + @obsp.setter + def obsp(self, value: Any) -> None: + _parent_set(self, "obsp", 
value) + + @property + def varp(self): + return _LoggingProxy(super().varp, self, "varp") + + @varp.setter + def varp(self, value: Any) -> None: + _parent_set(self, "varp", value) + + @property + def obs(self): + return _LoggingProxy(super().obs, self, "obs") + + @obs.setter + def obs(self, value: Any) -> None: + _parent_set(self, "obs", value) + + @property + def var(self): + return _LoggingProxy(super().var, self, "var") + + @var.setter + def var(self, value: Any) -> None: + _parent_set(self, "var", value) + + # --- snapshot & subsetting --- + + def _snapshot(self) -> AnnDataSnapshot: + return AnnDataSnapshot.from_anndata(self) + + def __getitem__(self, index: Any) -> "LoggedAnnDataStandalone": + pre = self._snapshot() + result = super().__getitem__(index) + logged_result = self.from_anndata(result) + msgs = _subset_messages(pre, logged_result._snapshot(), operation="subset") + _inherit_and_append(self, logged_result, msgs) + return logged_result + + def _inplace_subset(self, index: Any) -> None: + pre = self._snapshot() + super()._inplace_subset(index) + _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation="subset")) + + def _operation_log_block(self) -> str: + logs = self.uns.get(LOG_KEY, []) + if not logs: + return "" + return "\n\nOperation log:\n" + "\n".join(str(x) for x in logs) + + def __repr__(self) -> str: + return super().__repr__() + self._operation_log_block() + + def __str__(self) -> str: + return self.__repr__() + + def operation_log(self) -> list[str]: + return list(self.uns.get(LOG_KEY, [])) + + +def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone: + """Convert an AnnData object into a LoggedAnnDataStandalone instance.""" + return LoggedAnnDataStandalone.from_anndata(adata) diff --git a/renv.lock b/tests/__init__.py similarity index 100% rename from renv.lock rename to tests/__init__.py From 75fcf8da2a17711b391ab3010c78f2dc8404e084 Mon Sep 17 00:00:00 2001 From: jdhenaos Date: Wed, 20 May 2026 12:20:13 +0000 
Subject: [PATCH 2/3] data frame creator, TODO: fix overwrite --- demo.ipynb | 509 +++++++++++++++++++++++++++++++++++++++++++ src/omicslog/core.py | 86 +++++--- 2 files changed, 559 insertions(+), 36 deletions(-) create mode 100644 demo.ipynb diff --git a/demo.ipynb b/demo.ipynb new file mode 100644 index 0000000..d47097f --- /dev/null +++ b/demo.ipynb @@ -0,0 +1,509 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "d1a1434b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Obtaining file:///workspaces/project\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", + "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: anndata>=0.10 in ./.venv/lib/python3.11/site-packages (from omicslog==0.1.0) (0.12.10)\n", + "Requirement already satisfied: numpy<3.0.0,>=2.0.0 in ./.venv/lib/python3.11/site-packages (from omicslog==0.1.0) (2.3.5)\n", + "Requirement already satisfied: array-api-compat>=1.7.1 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (1.13.0)\n", + "Requirement already satisfied: h5py>=3.8 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (3.15.1)\n", + "Requirement already satisfied: legacy-api-wrap in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (1.5)\n", + "Requirement already satisfied: natsort in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (8.4.0)\n", + "Requirement already satisfied: packaging>=24.2 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (26.0)\n", + "Requirement already satisfied: pandas!=2.1.2,<3,>=2.1.0 in ./.venv/lib/python3.11/site-packages (from 
anndata>=0.10->omicslog==0.1.0) (2.3.3)\n", + "Requirement already satisfied: scipy!=1.17.0,>=1.12 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (1.16.3)\n", + "Requirement already satisfied: zarr!=3.0.*,>=2.18.7 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (3.1.5)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib/python3.11/site-packages (from pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.11/site-packages (from pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.11/site-packages (from pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (2025.3)\n", + "Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (1.17.0)\n", + "Requirement already satisfied: donfig>=0.8 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (0.8.1.post1)\n", + "Requirement already satisfied: google-crc32c>=1.5 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (1.8.0)\n", + "Requirement already satisfied: numcodecs>=0.14 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (0.16.5)\n", + "Requirement already satisfied: typing-extensions>=4.9 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (4.15.0)\n", + "Requirement already satisfied: pyyaml in ./.venv/lib/python3.11/site-packages (from donfig>=0.8->zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (6.0.3)\n", + "Building wheels for collected packages: omicslog\n", + " Building editable for omicslog (pyproject.toml) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for omicslog: filename=omicslog-0.1.0-py3-none-any.whl size=1718 sha256=dc6ec973c80cac950637ddf226c06e3733b3b894aed9e8a975c60a224fd85241\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-407htgir/wheels/37/30/e1/b1863257fe11c57155b308a6dae35341ed577d772e51eb2fc7\n", + "Successfully built omicslog\n", + "Installing collected packages: omicslog\n", + " Attempting uninstall: omicslog\n", + " Found existing installation: omicslog 0.1.0\n", + " Uninstalling omicslog-0.1.0:\n", + " Successfully uninstalled omicslog-0.1.0\n", + "Successfully installed omicslog-0.1.0\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install -e ." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "14411cbf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.12.10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_51390/1635567480.py:6: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n", + " print(ad.__version__)\n" + ] + } + ], + "source": [ + "from omicslog import log_start\n", + "import numpy as np\n", + "import pandas as pd\n", + "import anndata as ad\n", + "from scipy.sparse import csr_matrix\n", + "print(ad.__version__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c285a9b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 100 × 2000\n", + " uns: '_omicslog'\n", + "AnnData object with n_obs × n_vars = 100 × 2000\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + "\n", + "Operation log:\n", + "[2026-05-20 12:12:56] obs: 'cell_type' added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 12:12:56obs'cell_type' added
\n", + "
" + ], + "text/plain": [ + " Time Operation Message\n", + "0 2026-05-20 12:12:56 obs 'cell_type' added" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n", + "adata = ad.AnnData(counts)\n", + "\n", + "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n", + "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n", + "\n", + "logdata = log_start(adata)\n", + "print(logdata)\n", + "\n", + "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(logdata.n_obs,))\n", + "logdata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "b88c830d", + "metadata": {}, + "source": [ + "# Fltrating by Cells (.obs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b88b1e3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 31 × 2000\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Time, Operation, Message]\n", + "Index: []" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata = logdata[logdata.obs.cell_type == \"B\"]\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "6bcdb40a", + "metadata": {}, + "source": [ + "# Filtering by Genes (.var)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8f7ed85f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 31 × 200\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + "\n", + "Operation log:\n", + "[2026-05-20 12:14:09] subset: removed 1800 genes (90%), 200 genes remaining\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 12:14:09subsetremoved 1800 genes (90%), 200 genes remaining
\n", + "
" + ], + "text/plain": [ + " Time Operation \\\n", + "0 2026-05-20 12:14:09 subset \n", + "\n", + " Message \n", + "0 removed 1800 genes (90%), 200 genes remaining " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata = logdata[:,logdata.var_names.str.endswith(\"1\")]\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "a679a08e", + "metadata": {}, + "source": [ + "# Adding observatons and variables" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0fb2c8b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 31 × 200\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + " obsm: 'X_umap'\n", + " varm: 'gene_stuff'\n", + "\n", + "Operation log:\n", + "[2026-05-20 12:15:14] varm: 'gene_stuff' added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 12:15:14varm'gene_stuff' added
\n", + "
" + ], + "text/plain": [ + " Time Operation Message\n", + "0 2026-05-20 12:15:14 varm 'gene_stuff' added" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata.obsm[\"X_umap\"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))\n", + "logdata.varm[\"gene_stuff\"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + }, + { + "cell_type": "markdown", + "id": "2021a54b", + "metadata": {}, + "source": [ + "# Adding layers" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a8c41222", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AnnData object with n_obs × n_vars = 31 × 200\n", + " obs: 'cell_type'\n", + " uns: '_omicslog'\n", + " obsm: 'X_umap'\n", + " varm: 'gene_stuff'\n", + " layers: 'log_transformed'\n", + "\n", + "Operation log:\n", + "[2026-05-20 12:16:14] layers: 'log_transformed' added\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TimeOperationMessage
02026-05-20 12:16:14layers'log_transformed' added
\n", + "
" + ], + "text/plain": [ + " Time Operation Message\n", + "0 2026-05-20 12:16:14 layers 'log_transformed' added" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logdata.layers[\"log_transformed\"] = np.log1p(logdata.X)\n", + "print(logdata)\n", + "logdata.uns[\"_omicslog\"]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/omicslog/core.py b/src/omicslog/core.py index 8cd4872..dd33d60 100644 --- a/src/omicslog/core.py +++ b/src/omicslog/core.py @@ -6,42 +6,52 @@ from typing import Any import anndata as ad +import pandas as pd LOG_KEY = "_omicslog" +def _safe_deepcopy_dict(d: dict) -> dict: + result = {} + for k, v in d.items(): + try: + result[k] = deepcopy(v) + except (TypeError, Exception): + pass + return result + def _timestamp() -> str: return datetime.now().strftime("%Y-%m-%d %H:%M:%S") -def _format_log_message(operation: str, message: str, ts: str | None = None) -> str: +def _format_log_message(operation: str, message: str, ts: str | None = None) -> list[str]: stamp = ts or _timestamp() - return f"[{stamp}] {operation}: {message}" + return [stamp, operation, message] -def _ensure_log_container(adata: ad.AnnData) -> list[str]: +def _ensure_log_container(adata: ad.AnnData) -> pd.DataFrame: current = adata.uns.get(LOG_KEY) if not isinstance(current, list): - adata.uns[LOG_KEY] = [] + adata.uns[LOG_KEY] = pd.DataFrame(columns=["Time","Operation","Message"]) return adata.uns[LOG_KEY] def _append_log_messages(adata: ad.AnnData, messages: list[str] | tuple[str, ...]) -> None: if not messages: return - 
_ensure_log_container(adata).extend(messages) - + container = _ensure_log_container(adata) + new_rows = pd.DataFrame(messages, columns=["Time", "Operation", "Message"]) + adata.uns[LOG_KEY] = pd.concat([container, new_rows], ignore_index=True) def _inherit_and_append( parent: ad.AnnData, child: ad.AnnData, messages: list[str] | tuple[str, ...], ) -> None: - child.uns[LOG_KEY] = list(_ensure_log_container(parent)) + child.uns[LOG_KEY] = _ensure_log_container(parent) _append_log_messages(child, messages) - -def _parent_set(obj: Any, attr: str, value: Any) -> None: +def _parent_set(obj, attr: str, value) -> None: """Set an attribute via the first parent class that defines it. Works for both plain properties (.fset) and custom descriptors (.__set__). """ @@ -51,18 +61,18 @@ def _parent_set(obj: Any, attr: str, value: Any) -> None: return object.__setattr__(obj, attr, value) - class _LoggingProxy: - """Transparent proxy for dict-like AnnData components. + """ + Transparent proxy for dict-like AnnData components (layers, obsm, varm, ...). Intercepts __setitem__ and __delitem__ to log mutations automatically. 
""" - def __init__(self, wrapped: Any, owner: "LoggedAnnDataStandalone", label: str): + def __init__(self, wrapped, owner: "LoggedAnnDataStandalone", label: str): object.__setattr__(self, "_w", wrapped) object.__setattr__(self, "_owner", owner) object.__setattr__(self, "_label", label) - def __setitem__(self, key: str, value: Any) -> None: + def __setitem__(self, key: str, value) -> None: w = object.__getattribute__(self, "_w") owner = object.__getattribute__(self, "_owner") label = object.__getattribute__(self, "_label") @@ -77,29 +87,28 @@ def __delitem__(self, key: str) -> None: del w[key] _append_log_messages(owner, [_format_log_message(label, f"'{key}' removed")]) - def __getitem__(self, key: Any) -> Any: + def __getitem__(self, key): return object.__getattribute__(self, "_w")[key] - def __getattr__(self, name: str) -> Any: + def __getattr__(self, name): return getattr(object.__getattribute__(self, "_w"), name) - def __contains__(self, key: Any) -> bool: + def __contains__(self, key): return key in object.__getattribute__(self, "_w") def __iter__(self): return iter(object.__getattribute__(self, "_w")) - def __len__(self) -> int: + def __len__(self): return len(object.__getattribute__(self, "_w")) - def __repr__(self) -> str: + def __repr__(self): return repr(object.__getattribute__(self, "_w")) @dataclass class AnnDataSnapshot: - """Captures the key state of an AnnData object for diffing.""" - + """Captures the full state of an AnnData object for diffing.""" n_obs: int n_vars: int obs_cols: list[str] = field(default_factory=list) @@ -178,14 +187,14 @@ def _subset_messages( class LoggedAnnDataStandalone(ad.AnnData): - """AnnData subclass that automatically logs mutations to .uns['_omicslog'].""" + """Standalone subclass strategy with local logging helpers and message style.""" def __init__(self, *args: Any, **kwargs: Any): super().__init__(*args, **kwargs) _ensure_log_container(self) @classmethod - def _safe_component_copy(cls, value: Any) -> Any: + def 
_safe_component_copy(cls, value): return value.copy() if hasattr(value, "copy") else deepcopy(value) @classmethod @@ -198,13 +207,14 @@ def from_anndata(cls, adata: ad.AnnData) -> "LoggedAnnDataStandalone": "X": cls._safe_component_copy(adata.X) if adata.X is not None else None, "obs": adata.obs.copy(), "var": adata.var.copy(), - "uns": deepcopy(dict(adata.uns)), + "uns": _safe_deepcopy_dict(dict(adata.uns)), "obsm": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()}, "varm": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()}, "layers": {k: cls._safe_component_copy(v) for k, v in adata.layers.items()}, "obsp": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()}, "varp": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()}, } + if adata.raw is not None: kwargs["raw"] = { "X": cls._safe_component_copy(adata.raw.X), @@ -216,14 +226,14 @@ def from_anndata(cls, adata: ad.AnnData) -> "LoggedAnnDataStandalone": _ensure_log_container(logged) return logged - # --- proxied properties: intercept direct dict-style mutations --- + # --- proxied properties: each needs a getter AND a setter --- @property def layers(self): return _LoggingProxy(super().layers, self, "layers") @layers.setter - def layers(self, value: Any) -> None: + def layers(self, value): _parent_set(self, "layers", value) @property @@ -231,7 +241,7 @@ def obsm(self): return _LoggingProxy(super().obsm, self, "obsm") @obsm.setter - def obsm(self, value: Any) -> None: + def obsm(self, value): _parent_set(self, "obsm", value) @property @@ -239,7 +249,7 @@ def varm(self): return _LoggingProxy(super().varm, self, "varm") @varm.setter - def varm(self, value: Any) -> None: + def varm(self, value): _parent_set(self, "varm", value) @property @@ -247,7 +257,7 @@ def obsp(self): return _LoggingProxy(super().obsp, self, "obsp") @obsp.setter - def obsp(self, value: Any) -> None: + def obsp(self, value): _parent_set(self, "obsp", value) @property @@ -255,7 +265,7 @@ def 
varp(self): return _LoggingProxy(super().varp, self, "varp") @varp.setter - def varp(self, value: Any) -> None: + def varp(self, value): _parent_set(self, "varp", value) @property @@ -263,7 +273,7 @@ def obs(self): return _LoggingProxy(super().obs, self, "obs") @obs.setter - def obs(self, value: Any) -> None: + def obs(self, value): _parent_set(self, "obs", value) @property @@ -271,15 +281,15 @@ def var(self): return _LoggingProxy(super().var, self, "var") @var.setter - def var(self, value: Any) -> None: - _parent_set(self, "var", value) + def var(self, value): + ad.AnnData.var.fset(self, value) # --- snapshot & subsetting --- def _snapshot(self) -> AnnDataSnapshot: return AnnDataSnapshot.from_anndata(self) - def __getitem__(self, index: Any) -> "LoggedAnnDataStandalone": + def __getitem__(self, index): pre = self._snapshot() result = super().__getitem__(index) logged_result = self.from_anndata(result) @@ -287,13 +297,18 @@ def __getitem__(self, index: Any) -> "LoggedAnnDataStandalone": _inherit_and_append(self, logged_result, msgs) return logged_result - def _inplace_subset(self, index: Any) -> None: + def _inplace_subset(self, index): pre = self._snapshot() super()._inplace_subset(index) _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation="subset")) def _operation_log_block(self) -> str: logs = self.uns.get(LOG_KEY, []) + if isinstance(logs, pd.DataFrame): + if logs.empty: + return "" + rows = logs.apply(lambda r: f"[{r['Time']}] {r['Operation']}: {r['Message']}", axis=1) + return "\n\nOperation log:\n" + "\n".join(rows) if not logs: return "" return "\n\nOperation log:\n" + "\n".join(str(x) for x in logs) @@ -309,5 +324,4 @@ def operation_log(self) -> list[str]: def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone: - """Convert an AnnData object into a LoggedAnnDataStandalone instance.""" - return LoggedAnnDataStandalone.from_anndata(adata) + return LoggedAnnDataStandalone.from_anndata(adata) \ No newline at end of file From 
0b195c76496c2b3b2c5e04554d98c89c46fdc412 Mon Sep 17 00:00:00 2001 From: jdhenaos Date: Wed, 20 May 2026 14:33:57 +0000 Subject: [PATCH 3/3] table format functional --- README.md | 412 ++++++++++++++++++++++++++++++++++++++++--- demo.ipynb | 288 +++++++++++++++++++----------- make_readme.sh | 1 + pyproject.toml | 7 +- src/omicslog/core.py | 2 +- 5 files changed, 579 insertions(+), 131 deletions(-) create mode 100644 make_readme.sh diff --git a/README.md b/README.md index d30eb23..6babcea 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,398 @@ -# CompBio template: R (renv) + Python (Poetry) in separate Dev Containers +# Omicslog -## Before to start -1. Modify the `.devcontainer/.env` file with the path for your raw data. +## Importing packages -## Quick start (VS Code) -1. Open this repository in VS Code. -2. Command Palette → **Dev Containers: Reopen in Container** -3. Choose either: - - **project (R + renv)** for Bioconductor/ggplot work - - **project (Python + Poetry)** for ML/AI work -Both containers mount the same repository, so `data/` and `results/` are shared. +```python +from omicslog import log_start +import numpy as np +import pandas as pd +import anndata as ad +from scipy.sparse import csr_matrix -## R workflow (inside R container) -- Initialize renv (first time): `make r-init` -- Restore: `make r-restore` -- Snapshot: `make r-snapshot` -- Smoke test: `make r-check` +``` -## Python workflow (inside Python container) -- Install deps: `make py-install` -- Lock: `make py-lock` -- Smoke test: `make py-check` +
+📝 Note
+The AnnData object was generated using code from the original AnnData documentation. +
+ + +```python +counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32) +adata = ad.AnnData(counts) + +adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)] +adata.var_names = [f"Gene_{i:d}" for i in range(adata.n_vars)] + +logdata = log_start(adata) +print(logdata) + +ct = np.random.choice(["B", "T", "Monocyte"], size=(logdata.n_obs,)) +logdata.obs["cell_type"] = pd.Categorical(ct) # Categoricals are preferred for efficiency +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 100 × 2000 + uns: '_omicslog' + AnnData object with n_obs × n_vars = 100 × 2000 + obs: 'cell_type' + uns: '_omicslog' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + + + + + +
+ + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
+
+ + + +## Filtering by Cells (.obs) + + +```python +logdata = logdata[logdata.obs.cell_type == "B"] +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 2000 + obs: 'cell_type' + uns: '_omicslog' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
+
+ + + +## Filtering by Genes (.var) + + +```python +logdata = logdata[:,logdata.var_names.str.endswith("1")] +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 200 + obs: 'cell_type' + uns: '_omicslog' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
+
+ + + +## Adding observations and variables + + +```python +logdata.obsm["X_umap"] = np.random.normal(0, 1, size=(logdata.n_obs, 2)) +logdata.varm["gene_stuff"] = np.random.normal(0, 1, size=(logdata.n_vars, 5)) +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 200 + obs: 'cell_type' + uns: '_omicslog' + obsm: 'X_umap' + varm: 'gene_stuff' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining + [2026-05-20 13:49:48] obsm: 'X_umap' added + [2026-05-20 13:49:48] varm: 'gene_stuff' added + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
32026-05-20 13:49:48obsm'X_umap' added
42026-05-20 13:49:48varm'gene_stuff' added
+
+ + + +## Adding layers + + +```python +logdata.layers["log_transformed"] = np.log1p(logdata.X) +print(logdata) +logdata.uns["_omicslog"] +``` + + AnnData object with n_obs × n_vars = 27 × 200 + obs: 'cell_type' + uns: '_omicslog' + obsm: 'X_umap' + varm: 'gene_stuff' + layers: 'log_transformed' + + Operation log: + [2026-05-20 13:49:41] obs: 'cell_type' added + [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining + [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining + [2026-05-20 13:49:48] obsm: 'X_umap' added + [2026-05-20 13:49:48] varm: 'gene_stuff' added + [2026-05-20 13:49:50] layers: 'log_transformed' added + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TimeOperationMessage
02026-05-20 13:49:41obs'cell_type' added
12026-05-20 13:49:43subsetremoved 73 samples (73%), 27 samples remaining
22026-05-20 13:49:46subsetremoved 1800 genes (90%), 200 genes remaining
32026-05-20 13:49:48obsm'X_umap' added
42026-05-20 13:49:48varm'gene_stuff' added
52026-05-20 13:49:50layers'log_transformed' added
+
-## CI -GitHub Actions builds both images and runs smoke tests on push/PR. diff --git a/demo.ipynb b/demo.ipynb index d47097f..9cf0481 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -1,92 +1,43 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 6, - "id": "d1a1434b", + "cell_type": "markdown", + "id": "e19092cb", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Obtaining file:///workspaces/project\n", - " Installing build dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", - "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", - "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: anndata>=0.10 in ./.venv/lib/python3.11/site-packages (from omicslog==0.1.0) (0.12.10)\n", - "Requirement already satisfied: numpy<3.0.0,>=2.0.0 in ./.venv/lib/python3.11/site-packages (from omicslog==0.1.0) (2.3.5)\n", - "Requirement already satisfied: array-api-compat>=1.7.1 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (1.13.0)\n", - "Requirement already satisfied: h5py>=3.8 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (3.15.1)\n", - "Requirement already satisfied: legacy-api-wrap in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (1.5)\n", - "Requirement already satisfied: natsort in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (8.4.0)\n", - "Requirement already satisfied: packaging>=24.2 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (26.0)\n", - "Requirement already satisfied: pandas!=2.1.2,<3,>=2.1.0 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (2.3.3)\n", - "Requirement already satisfied: scipy!=1.17.0,>=1.12 in ./.venv/lib/python3.11/site-packages (from 
anndata>=0.10->omicslog==0.1.0) (1.16.3)\n", - "Requirement already satisfied: zarr!=3.0.*,>=2.18.7 in ./.venv/lib/python3.11/site-packages (from anndata>=0.10->omicslog==0.1.0) (3.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib/python3.11/site-packages (from pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.11/site-packages (from pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (2025.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.11/site-packages (from pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (2025.3)\n", - "Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas!=2.1.2,<3,>=2.1.0->anndata>=0.10->omicslog==0.1.0) (1.17.0)\n", - "Requirement already satisfied: donfig>=0.8 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (0.8.1.post1)\n", - "Requirement already satisfied: google-crc32c>=1.5 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (1.8.0)\n", - "Requirement already satisfied: numcodecs>=0.14 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (0.16.5)\n", - "Requirement already satisfied: typing-extensions>=4.9 in ./.venv/lib/python3.11/site-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (4.15.0)\n", - "Requirement already satisfied: pyyaml in ./.venv/lib/python3.11/site-packages (from donfig>=0.8->zarr!=3.0.*,>=2.18.7->anndata>=0.10->omicslog==0.1.0) (6.0.3)\n", - "Building wheels for collected packages: omicslog\n", - " Building editable for omicslog (pyproject.toml) ... 
\u001b[?25ldone\n", - "\u001b[?25h Created wheel for omicslog: filename=omicslog-0.1.0-py3-none-any.whl size=1718 sha256=dc6ec973c80cac950637ddf226c06e3733b3b894aed9e8a975c60a224fd85241\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-407htgir/wheels/37/30/e1/b1863257fe11c57155b308a6dae35341ed577d772e51eb2fc7\n", - "Successfully built omicslog\n", - "Installing collected packages: omicslog\n", - " Attempting uninstall: omicslog\n", - " Found existing installation: omicslog 0.1.0\n", - " Uninstalling omicslog-0.1.0:\n", - " Successfully uninstalled omicslog-0.1.0\n", - "Successfully installed omicslog-0.1.0\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], "source": [ - "pip install -e ." + "# Omicslog\n", + "\n", + "## Importing packages" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "id": "14411cbf", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.12.10\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_51390/1635567480.py:6: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n", - " print(ad.__version__)\n" - ] - } - ], + "outputs": [], "source": [ "from omicslog import log_start\n", "import numpy as np\n", "import pandas as pd\n", "import anndata as ad\n", - "from scipy.sparse import csr_matrix\n", - "print(ad.__version__)\n" + "from scipy.sparse import csr_matrix\n" + ] + }, + { + "cell_type": "markdown", + "id": "99bec3f9", + "metadata": {}, + "source": [ + "
\n", + "📝 Note
\n", + "The AnnData objects were generated using code from the original AnnData documentation.\n", + "
" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "id": "c285a9b4", "metadata": {}, "outputs": [ @@ -101,7 +52,7 @@ " uns: '_omicslog'\n", "\n", "Operation log:\n", - "[2026-05-20 12:12:56] obs: 'cell_type' added\n" + "[2026-05-20 13:49:41] obs: 'cell_type' added\n" ] }, { @@ -133,7 +84,7 @@ " \n", " \n", " 0\n", - " 2026-05-20 12:12:56\n", + " 2026-05-20 13:49:41\n", " obs\n", " 'cell_type' added\n", " \n", @@ -143,10 +94,10 @@ ], "text/plain": [ " Time Operation Message\n", - "0 2026-05-20 12:12:56 obs 'cell_type' added" + "0 2026-05-20 13:49:41 obs 'cell_type' added" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -172,12 +123,12 @@ "id": "b88c830d", "metadata": {}, "source": [ - "# Fltrating by Cells (.obs)" + "## Fltrating by Cells (.obs)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "id": "b88b1e3a", "metadata": {}, "outputs": [ @@ -185,9 +136,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "AnnData object with n_obs × n_vars = 31 × 2000\n", + "AnnData object with n_obs × n_vars = 27 × 2000\n", " obs: 'cell_type'\n", - " uns: '_omicslog'\n" + " uns: '_omicslog'\n", + "\n", + "Operation log:\n", + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n" ] }, { @@ -217,17 +172,33 @@ " \n", " \n", " \n", + " \n", + " 0\n", + " 2026-05-20 13:49:41\n", + " obs\n", + " 'cell_type' added\n", + " \n", + " \n", + " 1\n", + " 2026-05-20 13:49:43\n", + " subset\n", + " removed 73 samples (73%), 27 samples remaining\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [Time, Operation, Message]\n", - "Index: []" + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining " ] }, - 
"execution_count": 11, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -243,12 +214,12 @@ "id": "6bcdb40a", "metadata": {}, "source": [ - "# Filtering by Genes (.var)" + "## Filtering by Genes (.var)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "id": "8f7ed85f", "metadata": {}, "outputs": [ @@ -256,12 +227,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", + "AnnData object with n_obs × n_vars = 27 × 200\n", " obs: 'cell_type'\n", " uns: '_omicslog'\n", "\n", "Operation log:\n", - "[2026-05-20 12:14:09] subset: removed 1800 genes (90%), 200 genes remaining\n" + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n", + "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n" ] }, { @@ -293,7 +266,19 @@ " \n", " \n", " 0\n", - " 2026-05-20 12:14:09\n", + " 2026-05-20 13:49:41\n", + " obs\n", + " 'cell_type' added\n", + " \n", + " \n", + " 1\n", + " 2026-05-20 13:49:43\n", + " subset\n", + " removed 73 samples (73%), 27 samples remaining\n", + " \n", + " \n", + " 2\n", + " 2026-05-20 13:49:46\n", " subset\n", " removed 1800 genes (90%), 200 genes remaining\n", " \n", @@ -303,13 +288,17 @@ ], "text/plain": [ " Time Operation \\\n", - "0 2026-05-20 12:14:09 subset \n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "2 2026-05-20 13:49:46 subset \n", "\n", - " Message \n", - "0 removed 1800 genes (90%), 200 genes remaining " + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining \n", + "2 removed 1800 genes (90%), 200 genes remaining " ] }, - "execution_count": 12, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -325,12 +314,12 @@ "id": "a679a08e", "metadata": {}, "source": [ - "# Adding observatons and variables" + "## Adding observatons and variables" 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "id": "0fb2c8b9", "metadata": {}, "outputs": [ @@ -338,14 +327,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", + "AnnData object with n_obs × n_vars = 27 × 200\n", " obs: 'cell_type'\n", " uns: '_omicslog'\n", " obsm: 'X_umap'\n", " varm: 'gene_stuff'\n", "\n", "Operation log:\n", - "[2026-05-20 12:15:14] varm: 'gene_stuff' added\n" + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n", + "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n", + "[2026-05-20 13:49:48] obsm: 'X_umap' added\n", + "[2026-05-20 13:49:48] varm: 'gene_stuff' added\n" ] }, { @@ -377,7 +370,31 @@ " \n", " \n", " 0\n", - " 2026-05-20 12:15:14\n", + " 2026-05-20 13:49:41\n", + " obs\n", + " 'cell_type' added\n", + " \n", + " \n", + " 1\n", + " 2026-05-20 13:49:43\n", + " subset\n", + " removed 73 samples (73%), 27 samples remaining\n", + " \n", + " \n", + " 2\n", + " 2026-05-20 13:49:46\n", + " subset\n", + " removed 1800 genes (90%), 200 genes remaining\n", + " \n", + " \n", + " 3\n", + " 2026-05-20 13:49:48\n", + " obsm\n", + " 'X_umap' added\n", + " \n", + " \n", + " 4\n", + " 2026-05-20 13:49:48\n", " varm\n", " 'gene_stuff' added\n", " \n", @@ -386,11 +403,22 @@ "" ], "text/plain": [ - " Time Operation Message\n", - "0 2026-05-20 12:15:14 varm 'gene_stuff' added" + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "2 2026-05-20 13:49:46 subset \n", + "3 2026-05-20 13:49:48 obsm \n", + "4 2026-05-20 13:49:48 varm \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining \n", + "2 removed 1800 genes (90%), 200 genes remaining \n", + "3 'X_umap' added \n", + "4 'gene_stuff' added " ] }, - "execution_count": 13, + "execution_count": 17, "metadata": 
{}, "output_type": "execute_result" } @@ -407,12 +435,12 @@ "id": "2021a54b", "metadata": {}, "source": [ - "# Adding layers" + "## Adding layers" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "id": "a8c41222", "metadata": {}, "outputs": [ @@ -420,7 +448,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "AnnData object with n_obs × n_vars = 31 × 200\n", + "AnnData object with n_obs × n_vars = 27 × 200\n", " obs: 'cell_type'\n", " uns: '_omicslog'\n", " obsm: 'X_umap'\n", @@ -428,7 +456,12 @@ " layers: 'log_transformed'\n", "\n", "Operation log:\n", - "[2026-05-20 12:16:14] layers: 'log_transformed' added\n" + "[2026-05-20 13:49:41] obs: 'cell_type' added\n", + "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n", + "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n", + "[2026-05-20 13:49:48] obsm: 'X_umap' added\n", + "[2026-05-20 13:49:48] varm: 'gene_stuff' added\n", + "[2026-05-20 13:49:50] layers: 'log_transformed' added\n" ] }, { @@ -460,7 +493,37 @@ " \n", " \n", " 0\n", - " 2026-05-20 12:16:14\n", + " 2026-05-20 13:49:41\n", + " obs\n", + " 'cell_type' added\n", + " \n", + " \n", + " 1\n", + " 2026-05-20 13:49:43\n", + " subset\n", + " removed 73 samples (73%), 27 samples remaining\n", + " \n", + " \n", + " 2\n", + " 2026-05-20 13:49:46\n", + " subset\n", + " removed 1800 genes (90%), 200 genes remaining\n", + " \n", + " \n", + " 3\n", + " 2026-05-20 13:49:48\n", + " obsm\n", + " 'X_umap' added\n", + " \n", + " \n", + " 4\n", + " 2026-05-20 13:49:48\n", + " varm\n", + " 'gene_stuff' added\n", + " \n", + " \n", + " 5\n", + " 2026-05-20 13:49:50\n", " layers\n", " 'log_transformed' added\n", " \n", @@ -469,11 +532,24 @@ "" ], "text/plain": [ - " Time Operation Message\n", - "0 2026-05-20 12:16:14 layers 'log_transformed' added" + " Time Operation \\\n", + "0 2026-05-20 13:49:41 obs \n", + "1 2026-05-20 13:49:43 subset \n", + "2 2026-05-20 13:49:46 subset \n", + "3 
2026-05-20 13:49:48 obsm \n", + "4 2026-05-20 13:49:48 varm \n", + "5 2026-05-20 13:49:50 layers \n", + "\n", + " Message \n", + "0 'cell_type' added \n", + "1 removed 73 samples (73%), 27 samples remaining \n", + "2 removed 1800 genes (90%), 200 genes remaining \n", + "3 'X_umap' added \n", + "4 'gene_stuff' added \n", + "5 'log_transformed' added " ] }, - "execution_count": 14, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } diff --git a/make_readme.sh b/make_readme.sh new file mode 100644 index 0000000..cf971a5 --- /dev/null +++ b/make_readme.sh @@ -0,0 +1 @@ +poetry run jupyter nbconvert demo.ipynb --to markdown --output README \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index cf82774..431a26b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,11 +2,12 @@ name = "omicslog" version = "0.1.0" description = "AnnData subclass that automatically logs mutations to .uns['_omicslog']" -authors = ["Your Name "] +authors = ["Stefano Mangiola ", "Juan Henao "] +manteiners = ["Juan Henao "] license = "MIT" readme = "README.md" -homepage = "https://github.com/your-org/omicslog" -repository = "https://github.com/your-org/omicslog" +homepage = "https://github.com/tidyomics/omicslog_dev" +repository = "https://github.com/tidyomics/omicslog_dev" keywords = ["bioinformatics", "single-cell", "anndata", "logging"] classifiers = [ "Programming Language :: Python :: 3", diff --git a/src/omicslog/core.py b/src/omicslog/core.py index dd33d60..40db784 100644 --- a/src/omicslog/core.py +++ b/src/omicslog/core.py @@ -31,7 +31,7 @@ def _format_log_message(operation: str, message: str, ts: str | None = None) -> def _ensure_log_container(adata: ad.AnnData) -> pd.DataFrame: current = adata.uns.get(LOG_KEY) - if not isinstance(current, list): + if not isinstance(current, pd.DataFrame): adata.uns[LOG_KEY] = pd.DataFrame(columns=["Time","Operation","Message"]) return adata.uns[LOG_KEY]