diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml
deleted file mode 100644
index 76f6ad3..0000000
--- a/.devcontainer/docker-compose.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-version: "3.9"
-
-services:
- r-dev:
- build:
- context: ..
- dockerfile: .devcontainer/r/Dockerfile
- volumes:
- - ..:/workspaces/project:cached
- - renv_cache:/opt/renv/cache
- - ${SECURE_DATA_DIR:-}:/secure-data:ro
- working_dir: /workspaces/project
- command: sleep infinity
-
- py-dev:
- build:
- context: ..
- dockerfile: .devcontainer/py/Dockerfile
- volumes:
- - ..:/workspaces/project:cached
- - ${SECURE_DATA_DIR:-}:/secure-data:ro
- working_dir: /workspaces/project
- command: sleep infinity
-
-volumes:
- renv_cache:
- poetry_cache:
- pip_cache:
-
diff --git a/.devcontainer/r/Dockerfile b/.devcontainer/r/Dockerfile
deleted file mode 100644
index 6c2368b..0000000
--- a/.devcontainer/r/Dockerfile
+++ /dev/null
@@ -1,48 +0,0 @@
-# Pin R version here (adjust as needed)
-FROM rocker/r-ver:4.4.2
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-# System deps commonly needed for Bioconductor + ggplot + compilation
-RUN apt-get update && apt-get install -y --no-install-recommends \
- git \
- curl \
- wget \
- ca-certificates \
- locales \
- build-essential \
- pkg-config \
- libcurl4-openssl-dev \
- libssl-dev \
- libxml2-dev \
- libfontconfig1-dev \
- libfreetype6-dev \
- libpng-dev \
- libtiff5-dev \
- libjpeg-dev \
- libcairo2-dev \
- libharfbuzz-dev \
- libfribidi-dev \
- libglpk-dev \
- && rm -rf /var/lib/apt/lists/*
-
-# Locale (helps with some R packages / rendering)
-RUN sed -i 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
- locale-gen
-ENV LANG=en_US.UTF-8
-ENV LC_ALL=en_US.UTF-8
-
-# Install renv in the system library (project will still use renv.lock)
-RUN R -q -e "install.packages('renv', repos='https://cloud.r-project.org')"
-
-# --- Quarto (pin version for reproducible rendering) ---
-ENV QUARTO_VERSION=1.5.57
-RUN wget -q "https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.deb" \
- && dpkg -i "quarto-${QUARTO_VERSION}-linux-amd64.deb" \
- && rm -f "quarto-${QUARTO_VERSION}-linux-amd64.deb"
-
-# Optional: speed up / standardize renv cache location (persisted via volume)
-ENV RENV_PATHS_CACHE=/opt/renv/cache
-
-WORKDIR /workspaces/project
-
diff --git a/.devcontainer/r/devcontainer.json b/.devcontainer/r/devcontainer.json
deleted file mode 100644
index d28daad..0000000
--- a/.devcontainer/r/devcontainer.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
- "name": "project (R + renv)",
- "dockerComposeFile": ["../docker-compose.yml"],
- "service": "r-dev",
- "workspaceFolder": "/workspaces/project",
- "shutdownAction": "stopCompose",
-
- "customizations": {
- "vscode": {
- "extensions": [
- "REditorSupport.r",
- "rdebugger.r-debugger",
- "Ikuyadeu.r",
- "ms-azuretools.vscode-docker",
- "GitHub.copilot"
- ],
- "settings": {
- "r.rterm.linux": "/usr/bin/R",
- "r.bracketedPaste": true
- }
- }
- },
-
- "postCreateCommand": "bash -lc 'quarto --version && if [ -f renv.lock ]; then R -q -e \"renv::restore(prompt = FALSE)\"; else echo \"No renv.lock yet. Run: R -q -e \\\"renv::init()\\\"\"; fi'"
-}
-
diff --git a/.gitignore b/.gitignore
index b2f88fe..0d024df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,11 +8,8 @@ __pycache__/
.pytest_cache/
.mypy_cache/
.ruff_cache/
-
-# R
-.Rhistory
-.RData
-.Rproj.user/
+dist/
+*.egg-info/
# Outputs
results/
@@ -22,12 +19,6 @@ data/**/processed/
# OS
.DS_Store
-# Quarto
-.quarto/
-_quarto/
-_site/
-_freeze/
-
.devcontainer/.env
.env
config/local.env
diff --git a/README.md b/README.md
index d30eb23..6babcea 100644
--- a/README.md
+++ b/README.md
@@ -1,28 +1,398 @@
-# CompBio template: R (renv) + Python (Poetry) in separate Dev Containers
+# Omicslog
-## Before to start
-1. Modify the `.devcontainer/.env` file with the path for your raw data.
+## Importing packages
-## Quick start (VS Code)
-1. Open this repository in VS Code.
-2. Command Palette → **Dev Containers: Reopen in Container**
-3. Choose either:
- - **project (R + renv)** for Bioconductor/ggplot work
- - **project (Python + Poetry)** for ML/AI work
-Both containers mount the same repository, so `data/` and `results/` are shared.
+```python
+from omicslog import log_start
+import numpy as np
+import pandas as pd
+import anndata as ad
+from scipy.sparse import csr_matrix
-## R workflow (inside R container)
-- Initialize renv (first time): `make r-init`
-- Restore: `make r-restore`
-- Snapshot: `make r-snapshot`
-- Smoke test: `make r-check`
+```
-## Python workflow (inside Python container)
-- Install deps: `make py-install`
-- Lock: `make py-lock`
-- Smoke test: `make py-check`
+
+
📝 Note
+The AnnData object was generated using code from the original
AnnData documentation.
+
+
+
+```python
+counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
+adata = ad.AnnData(counts)
+
+adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)]
+adata.var_names = [f"Gene_{i:d}" for i in range(adata.n_vars)]
+
+logdata = log_start(adata)
+print(logdata)
+
+ct = np.random.choice(["B", "T", "Monocyte"], size=(logdata.n_obs,))
+logdata.obs["cell_type"] = pd.Categorical(ct) # Categoricals are preferred for efficiency
+print(logdata)
+logdata.uns["_omicslog"]
+```
+
+ AnnData object with n_obs × n_vars = 100 × 2000
+ uns: '_omicslog'
+ AnnData object with n_obs × n_vars = 100 × 2000
+ obs: 'cell_type'
+ uns: '_omicslog'
+
+ Operation log:
+ [2026-05-20 13:49:41] obs: 'cell_type' added
+
+
+
+
+
+
+
+
+
+
+ |
+ Time |
+ Operation |
+ Message |
+
+
+
+
+ | 0 |
+ 2026-05-20 13:49:41 |
+ obs |
+ 'cell_type' added |
+
+
+
+
+
+
+
+## Filtering by Cells (.obs)
+
+
+```python
+logdata = logdata[logdata.obs.cell_type == "B"]
+print(logdata)
+logdata.uns["_omicslog"]
+```
+
+ AnnData object with n_obs × n_vars = 27 × 2000
+ obs: 'cell_type'
+ uns: '_omicslog'
+
+ Operation log:
+ [2026-05-20 13:49:41] obs: 'cell_type' added
+ [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining
+
+
+
+
+
+
+
+
+
+
+ |
+ Time |
+ Operation |
+ Message |
+
+
+
+
+ | 0 |
+ 2026-05-20 13:49:41 |
+ obs |
+ 'cell_type' added |
+
+
+ | 1 |
+ 2026-05-20 13:49:43 |
+ subset |
+ removed 73 samples (73%), 27 samples remaining |
+
+
+
+
+
+
+
+## Filtering by Genes (.var)
+
+
+```python
+logdata = logdata[:,logdata.var_names.str.endswith("1")]
+print(logdata)
+logdata.uns["_omicslog"]
+```
+
+ AnnData object with n_obs × n_vars = 27 × 200
+ obs: 'cell_type'
+ uns: '_omicslog'
+
+ Operation log:
+ [2026-05-20 13:49:41] obs: 'cell_type' added
+ [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining
+ [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining
+
+
+
+
+
+
+
+
+
+
+ |
+ Time |
+ Operation |
+ Message |
+
+
+
+
+ | 0 |
+ 2026-05-20 13:49:41 |
+ obs |
+ 'cell_type' added |
+
+
+ | 1 |
+ 2026-05-20 13:49:43 |
+ subset |
+ removed 73 samples (73%), 27 samples remaining |
+
+
+ | 2 |
+ 2026-05-20 13:49:46 |
+ subset |
+ removed 1800 genes (90%), 200 genes remaining |
+
+
+
+
+
+
+
+## Adding observations and variables
+
+
+```python
+logdata.obsm["X_umap"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))
+logdata.varm["gene_stuff"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))
+print(logdata)
+logdata.uns["_omicslog"]
+```
+
+ AnnData object with n_obs × n_vars = 27 × 200
+ obs: 'cell_type'
+ uns: '_omicslog'
+ obsm: 'X_umap'
+ varm: 'gene_stuff'
+
+ Operation log:
+ [2026-05-20 13:49:41] obs: 'cell_type' added
+ [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining
+ [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining
+ [2026-05-20 13:49:48] obsm: 'X_umap' added
+ [2026-05-20 13:49:48] varm: 'gene_stuff' added
+
+
+
+
+
+
+
+
+
+
+ |
+ Time |
+ Operation |
+ Message |
+
+
+
+
+ | 0 |
+ 2026-05-20 13:49:41 |
+ obs |
+ 'cell_type' added |
+
+
+ | 1 |
+ 2026-05-20 13:49:43 |
+ subset |
+ removed 73 samples (73%), 27 samples remaining |
+
+
+ | 2 |
+ 2026-05-20 13:49:46 |
+ subset |
+ removed 1800 genes (90%), 200 genes remaining |
+
+
+ | 3 |
+ 2026-05-20 13:49:48 |
+ obsm |
+ 'X_umap' added |
+
+
+ | 4 |
+ 2026-05-20 13:49:48 |
+ varm |
+ 'gene_stuff' added |
+
+
+
+
+
+
+
+## Adding layers
+
+
+```python
+logdata.layers["log_transformed"] = np.log1p(logdata.X)
+print(logdata)
+logdata.uns["_omicslog"]
+```
+
+ AnnData object with n_obs × n_vars = 27 × 200
+ obs: 'cell_type'
+ uns: '_omicslog'
+ obsm: 'X_umap'
+ varm: 'gene_stuff'
+ layers: 'log_transformed'
+
+ Operation log:
+ [2026-05-20 13:49:41] obs: 'cell_type' added
+ [2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining
+ [2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining
+ [2026-05-20 13:49:48] obsm: 'X_umap' added
+ [2026-05-20 13:49:48] varm: 'gene_stuff' added
+ [2026-05-20 13:49:50] layers: 'log_transformed' added
+
+
+
+
+
+
+
+
+
+
+ |
+ Time |
+ Operation |
+ Message |
+
+
+
+
+ | 0 |
+ 2026-05-20 13:49:41 |
+ obs |
+ 'cell_type' added |
+
+
+ | 1 |
+ 2026-05-20 13:49:43 |
+ subset |
+ removed 73 samples (73%), 27 samples remaining |
+
+
+ | 2 |
+ 2026-05-20 13:49:46 |
+ subset |
+ removed 1800 genes (90%), 200 genes remaining |
+
+
+ | 3 |
+ 2026-05-20 13:49:48 |
+ obsm |
+ 'X_umap' added |
+
+
+ | 4 |
+ 2026-05-20 13:49:48 |
+ varm |
+ 'gene_stuff' added |
+
+
+ | 5 |
+ 2026-05-20 13:49:50 |
+ layers |
+ 'log_transformed' added |
+
+
+
+
-## CI
-GitHub Actions builds both images and runs smoke tests on push/PR.
diff --git a/analysis/py/adata_reference_code.ipynb b/analysis/py/adata_reference_code.ipynb
deleted file mode 100644
index cc84126..0000000
--- a/analysis/py/adata_reference_code.ipynb
+++ /dev/null
@@ -1,243 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "8fb259db",
- "metadata": {},
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "dbb1493c",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0.12.10\n",
- "AnnData object with n_obs × n_vars = 100 × 2000\n",
- "Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4', 'Cell_5', 'Cell_6',\n",
- " 'Cell_7', 'Cell_8', 'Cell_9'],\n",
- " dtype='object')\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_45018/475607713.py:5: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n",
- " print(ad.__version__)\n"
- ]
- }
- ],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "import anndata as ad\n",
- "from scipy.sparse import csr_matrix\n",
- "print(ad.__version__)\n",
- "\n",
- "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n",
- "adata = ad.AnnData(counts)\n",
- "print(adata)\n",
- "\n",
- "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n",
- "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n",
- "print(adata.obs_names[:10])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "beed1340",
- "metadata": {},
- "source": [
- "# Filtering by Cells"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "f0969e08",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "View of AnnData object with n_obs × n_vars = 2 × 2000"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "adata = adata[[\"Cell_1\", \"Cell_10\"], ]\n",
- "adata"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3f9bfc1c",
- "metadata": {},
- "source": [
- "# Filtering by cells (.obs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "a0b17993",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_45018/910546738.py:2: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
- " adata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " cell_type | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | Cell_1 | \n",
- " B | \n",
- "
\n",
- " \n",
- " | Cell_10 | \n",
- " T | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " cell_type\n",
- "Cell_1 B\n",
- "Cell_10 T"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(adata.n_obs,))\n",
- "adata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n",
- "adata.obs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "c77a07da",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "View of AnnData object with n_obs × n_vars = 2 × 2000\n",
- " obs: 'cell_type'"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "\n",
- "adata = adata[adata.obs.cell_type == \"B\"]\n",
- "adata"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a3c37b5a",
- "metadata": {},
- "source": [
- "# Adding layers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "fab07d95",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_44426/2818935080.py:1: ImplicitModificationWarning: Setting element `.layers['log_transformed']` of view, initializing view as actual.\n",
- " adata.layers[\"log_transformed\"] = np.log1p(adata.X)\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "AnnData object with n_obs × n_vars = 2 × 2000\n",
- " obs: 'cell_type'\n",
- " layers: 'log_transformed'"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "adata.layers[\"log_transformed\"] = np.log1p(adata.X)\n",
- "adata"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.14"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/analysis/py/omicslog_beta.ipynb b/analysis/py/omicslog_beta.ipynb
deleted file mode 100644
index d194419..0000000
--- a/analysis/py/omicslog_beta.ipynb
+++ /dev/null
@@ -1,553 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "f0b85c1e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from __future__ import annotations\n",
- "\n",
- "from copy import deepcopy\n",
- "from dataclasses import dataclass, field\n",
- "from datetime import datetime\n",
- "from typing import Any\n",
- "\n",
- "import anndata as ad\n",
- "\n",
- "LOG_KEY = \"_omicslog\"\n",
- "\n",
- "\n",
- "def _timestamp() -> str:\n",
- " return datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
- "\n",
- "\n",
- "def _format_log_message(operation: str, message: str, ts: str | None = None) -> str:\n",
- " stamp = ts or _timestamp()\n",
- " return f\"[{stamp}] {operation}: {message}\"\n",
- "\n",
- "\n",
- "def _ensure_log_container(adata: ad.AnnData) -> list[str]:\n",
- " current = adata.uns.get(LOG_KEY)\n",
- " if not isinstance(current, list):\n",
- " adata.uns[LOG_KEY] = []\n",
- " return adata.uns[LOG_KEY]\n",
- "\n",
- "\n",
- "def _append_log_messages(adata: ad.AnnData, messages: list[str] | tuple[str, ...]) -> None:\n",
- " if not messages:\n",
- " return\n",
- " _ensure_log_container(adata).extend(messages)\n",
- "\n",
- "\n",
- "def _inherit_and_append(\n",
- " parent: ad.AnnData,\n",
- " child: ad.AnnData,\n",
- " messages: list[str] | tuple[str, ...],\n",
- ") -> None:\n",
- " child.uns[LOG_KEY] = list(_ensure_log_container(parent))\n",
- " _append_log_messages(child, messages)\n",
- "\n",
- "def _parent_set(obj, attr: str, value) -> None:\n",
- " \"\"\"Set an attribute via the first parent class that defines it.\n",
- " Works for both plain properties (.fset) and custom descriptors (.__set__).\n",
- " \"\"\"\n",
- " for base in type(obj).__mro__[1:]:\n",
- " if attr in base.__dict__:\n",
- " base.__dict__[attr].__set__(obj, value)\n",
- " return\n",
- " object.__setattr__(obj, attr, value)\n",
- "\n",
- "class _LoggingProxy:\n",
- " \"\"\"\n",
- " Transparent proxy for dict-like AnnData components (layers, obsm, varm, ...).\n",
- " Intercepts __setitem__ and __delitem__ to log mutations automatically.\n",
- " \"\"\"\n",
- "\n",
- " def __init__(self, wrapped, owner: \"LoggedAnnDataStandalone\", label: str):\n",
- " object.__setattr__(self, \"_w\", wrapped)\n",
- " object.__setattr__(self, \"_owner\", owner)\n",
- " object.__setattr__(self, \"_label\", label)\n",
- "\n",
- " def __setitem__(self, key: str, value) -> None:\n",
- " w = object.__getattribute__(self, \"_w\")\n",
- " owner = object.__getattribute__(self, \"_owner\")\n",
- " label = object.__getattribute__(self, \"_label\")\n",
- " verb = \"updated\" if key in w else \"added\"\n",
- " w[key] = value\n",
- " _append_log_messages(owner, [_format_log_message(label, f\"'{key}' {verb}\")])\n",
- "\n",
- " def __delitem__(self, key: str) -> None:\n",
- " w = object.__getattribute__(self, \"_w\")\n",
- " owner = object.__getattribute__(self, \"_owner\")\n",
- " label = object.__getattribute__(self, \"_label\")\n",
- " del w[key]\n",
- " _append_log_messages(owner, [_format_log_message(label, f\"'{key}' removed\")])\n",
- "\n",
- " def __getitem__(self, key):\n",
- " return object.__getattribute__(self, \"_w\")[key]\n",
- "\n",
- " def __getattr__(self, name):\n",
- " return getattr(object.__getattribute__(self, \"_w\"), name)\n",
- "\n",
- " def __contains__(self, key):\n",
- " return key in object.__getattribute__(self, \"_w\")\n",
- "\n",
- " def __iter__(self):\n",
- " return iter(object.__getattribute__(self, \"_w\"))\n",
- "\n",
- " def __len__(self):\n",
- " return len(object.__getattribute__(self, \"_w\"))\n",
- "\n",
- " def __repr__(self):\n",
- " return repr(object.__getattribute__(self, \"_w\"))\n",
- "\n",
- "\n",
- "@dataclass\n",
- "class AnnDataSnapshot:\n",
- " \"\"\"Captures the full state of an AnnData object for diffing.\"\"\"\n",
- " n_obs: int\n",
- " n_vars: int\n",
- " obs_cols: list[str] = field(default_factory=list)\n",
- " var_cols: list[str] = field(default_factory=list)\n",
- " layers: list[str] = field(default_factory=list)\n",
- " obsm: list[str] = field(default_factory=list)\n",
- " varm: list[str] = field(default_factory=list)\n",
- " obsp: list[str] = field(default_factory=list)\n",
- " varp: list[str] = field(default_factory=list)\n",
- "\n",
- " @classmethod\n",
- " def from_anndata(cls, adata: ad.AnnData) -> \"AnnDataSnapshot\":\n",
- " return cls(\n",
- " n_obs=adata.n_obs,\n",
- " n_vars=adata.n_vars,\n",
- " obs_cols=list(adata.obs.columns),\n",
- " var_cols=list(adata.var.columns),\n",
- " layers=list(adata.layers.keys()),\n",
- " obsm=list(adata.obsm.keys()),\n",
- " varm=list(adata.varm.keys()),\n",
- " obsp=list(adata.obsp.keys()),\n",
- " varp=list(adata.varp.keys()),\n",
- " )\n",
- "\n",
- "\n",
- "def _diff_keys(\n",
- " pre: list[str],\n",
- " post: list[str],\n",
- " label: str,\n",
- " operation: str,\n",
- " ts: str,\n",
- ") -> list[str]:\n",
- " msgs = []\n",
- " for k in sorted(set(pre) - set(post)):\n",
- " msgs.append(_format_log_message(operation, f\"{label} removed: '{k}'\", ts))\n",
- " for k in sorted(set(post) - set(pre)):\n",
- " msgs.append(_format_log_message(operation, f\"{label} added: '{k}'\", ts))\n",
- " return msgs\n",
- "\n",
- "\n",
- "def _subset_messages(\n",
- " pre: AnnDataSnapshot,\n",
- " post: AnnDataSnapshot,\n",
- " operation: str = \"subset\",\n",
- ") -> list[str]:\n",
- " msgs: list[str] = []\n",
- " ts = _timestamp()\n",
- "\n",
- " if pre.n_vars != post.n_vars:\n",
- " removed = pre.n_vars - post.n_vars\n",
- " pct = round((removed / pre.n_vars) * 100) if pre.n_vars else 0\n",
- " msgs.append(_format_log_message(\n",
- " operation,\n",
- " f\"removed {removed} genes ({pct}%), {post.n_vars} genes remaining\",\n",
- " ts,\n",
- " ))\n",
- "\n",
- " if pre.n_obs != post.n_obs:\n",
- " removed = pre.n_obs - post.n_obs\n",
- " pct = round((removed / pre.n_obs) * 100) if pre.n_obs else 0\n",
- " msgs.append(_format_log_message(\n",
- " operation,\n",
- " f\"removed {removed} samples ({pct}%), {post.n_obs} samples remaining\",\n",
- " ts,\n",
- " ))\n",
- "\n",
- " msgs += _diff_keys(pre.obs_cols, post.obs_cols, \"obs column\", operation, ts)\n",
- " msgs += _diff_keys(pre.var_cols, post.var_cols, \"var column\", operation, ts)\n",
- " msgs += _diff_keys(pre.layers, post.layers, \"layer\", operation, ts)\n",
- " msgs += _diff_keys(pre.obsm, post.obsm, \"obsm\", operation, ts)\n",
- " msgs += _diff_keys(pre.varm, post.varm, \"varm\", operation, ts)\n",
- " msgs += _diff_keys(pre.obsp, post.obsp, \"obsp\", operation, ts)\n",
- " msgs += _diff_keys(pre.varp, post.varp, \"varp\", operation, ts)\n",
- "\n",
- " return msgs\n",
- "\n",
- "\n",
- "class LoggedAnnDataStandalone(ad.AnnData):\n",
- " \"\"\"Standalone subclass strategy with local logging helpers and message style.\"\"\"\n",
- "\n",
- " def __init__(self, *args: Any, **kwargs: Any):\n",
- " super().__init__(*args, **kwargs)\n",
- " _ensure_log_container(self)\n",
- "\n",
- " @classmethod\n",
- " def _safe_component_copy(cls, value):\n",
- " return value.copy() if hasattr(value, \"copy\") else deepcopy(value)\n",
- "\n",
- " @classmethod\n",
- " def from_anndata(cls, adata: ad.AnnData) -> \"LoggedAnnDataStandalone\":\n",
- " if isinstance(adata, cls):\n",
- " _ensure_log_container(adata)\n",
- " return adata\n",
- "\n",
- " kwargs: dict[str, Any] = {\n",
- " \"X\": cls._safe_component_copy(adata.X) if adata.X is not None else None,\n",
- " \"obs\": adata.obs.copy(),\n",
- " \"var\": adata.var.copy(),\n",
- " \"uns\": deepcopy(dict(adata.uns)),\n",
- " \"obsm\": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()},\n",
- " \"varm\": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()},\n",
- " \"layers\": {k: cls._safe_component_copy(v) for k, v in adata.layers.items()},\n",
- " \"obsp\": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()},\n",
- " \"varp\": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()},\n",
- " }\n",
- " if adata.raw is not None:\n",
- " kwargs[\"raw\"] = {\n",
- " \"X\": cls._safe_component_copy(adata.raw.X),\n",
- " \"var\": adata.raw.var.copy(),\n",
- " \"varm\": {k: cls._safe_component_copy(v) for k, v in adata.raw.varm.items()},\n",
- " }\n",
- "\n",
- " logged = cls(**kwargs)\n",
- " _ensure_log_container(logged)\n",
- " return logged\n",
- "\n",
- " # --- proxied properties: each needs a getter AND a setter ---\n",
- "\n",
- " @property\n",
- " def layers(self):\n",
- " return _LoggingProxy(super().layers, self, \"layers\")\n",
- "\n",
- " @layers.setter\n",
- " def layers(self, value):\n",
- " _parent_set(self, \"layers\", value)\n",
- "\n",
- " @property\n",
- " def obsm(self):\n",
- " return _LoggingProxy(super().obsm, self, \"obsm\")\n",
- "\n",
- " @obsm.setter\n",
- " def obsm(self, value):\n",
- " _parent_set(self, \"obsm\", value)\n",
- "\n",
- " @property\n",
- " def varm(self):\n",
- " return _LoggingProxy(super().varm, self, \"varm\")\n",
- "\n",
- " @varm.setter\n",
- " def varm(self, value):\n",
- " _parent_set(self, \"varm\", value)\n",
- "\n",
- " @property\n",
- " def obsp(self):\n",
- " return _LoggingProxy(super().obsp, self, \"obsp\")\n",
- "\n",
- " @obsp.setter\n",
- " def obsp(self, value):\n",
- " _parent_set(self, \"obsp\", value)\n",
- "\n",
- " @property\n",
- " def varp(self):\n",
- " return _LoggingProxy(super().varp, self, \"varp\")\n",
- "\n",
- " @varp.setter\n",
- " def varp(self, value):\n",
- " _parent_set(self, \"varp\", value)\n",
- "\n",
- " @property\n",
- " def obs(self):\n",
- " return _LoggingProxy(super().obs, self, \"obs\")\n",
- "\n",
- " @obs.setter\n",
- " def obs(self, value):\n",
- " _parent_set(self, \"obs\", value)\n",
- "\n",
- " @property\n",
- " def var(self):\n",
- " return _LoggingProxy(super().var, self, \"var\")\n",
- "\n",
- " @var.setter\n",
- " def var(self, value):\n",
- " ad.AnnData.var.fset(self, value)\n",
- "\n",
- " # --- snapshot & subsetting ---\n",
- "\n",
- " def _snapshot(self) -> AnnDataSnapshot:\n",
- " return AnnDataSnapshot.from_anndata(self)\n",
- "\n",
- " def __getitem__(self, index):\n",
- " pre = self._snapshot()\n",
- " result = super().__getitem__(index)\n",
- " logged_result = self.from_anndata(result)\n",
- " msgs = _subset_messages(pre, logged_result._snapshot(), operation=\"subset\")\n",
- " _inherit_and_append(self, logged_result, msgs)\n",
- " return logged_result\n",
- "\n",
- " def _inplace_subset(self, index):\n",
- " pre = self._snapshot()\n",
- " super()._inplace_subset(index)\n",
- " _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation=\"subset\"))\n",
- "\n",
- " def _operation_log_block(self) -> str:\n",
- " logs = self.uns.get(LOG_KEY, [])\n",
- " if not logs:\n",
- " return \"\"\n",
- " return \"\\n\\nOperation log:\\n\" + \"\\n\".join(str(x) for x in logs)\n",
- "\n",
- " def __repr__(self) -> str:\n",
- " return super().__repr__() + self._operation_log_block()\n",
- "\n",
- " def __str__(self) -> str:\n",
- " return self.__repr__()\n",
- "\n",
- " def operation_log(self) -> list[str]:\n",
- " return list(self.uns.get(LOG_KEY, []))\n",
- "\n",
- "\n",
- "def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone:\n",
- " return LoggedAnnDataStandalone.from_anndata(adata)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "a256216a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0.12.10\n",
- "AnnData object with n_obs × n_vars = 100 × 2000\n",
- " uns: '_omicslog'\n",
- "AnnData object with n_obs × n_vars = 100 × 2000\n",
- " obs: 'cell_type'\n",
- " uns: '_omicslog'\n",
- "\n",
- "Operation log:\n",
- "[2026-03-27 16:38:57] obs: 'cell_type' added\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_785154/2927559619.py:5: FutureWarning: `__version__` is deprecated, use `importlib.metadata.version('anndata')` instead.\n",
- " print(ad.__version__)\n"
- ]
- }
- ],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "import anndata as ad\n",
- "from scipy.sparse import csr_matrix\n",
- "print(ad.__version__)\n",
- "\n",
- "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n",
- "adata = ad.AnnData(counts)\n",
- "\n",
- "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n",
- "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n",
- "\n",
- "logdata = log_start(adata)\n",
- "print(logdata)\n",
- "\n",
- "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(logdata.n_obs,))\n",
- "logdata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n",
- "print(logdata)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8e0e24d1",
- "metadata": {},
- "source": [
- "# Fltrating by Cells (.obs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "44d8de87",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AnnData object with n_obs × n_vars = 31 × 2000\n",
- " obs: 'cell_type'\n",
- " uns: '_omicslog'\n",
- "\n",
- "Operation log:\n",
- "[2026-03-27 16:38:57] obs: 'cell_type' added\n",
- "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "logdata = logdata[logdata.obs.cell_type == \"B\"]\n",
- "logdata"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0f4ac517",
- "metadata": {},
- "source": [
- "# Filtering by Genes (.var)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "af577b51",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AnnData object with n_obs × n_vars = 31 × 200\n",
- " obs: 'cell_type'\n",
- " uns: '_omicslog'\n",
- "\n",
- "Operation log:\n",
- "[2026-03-27 16:38:57] obs: 'cell_type' added\n",
- "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n",
- "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "logdata = logdata[:,logdata.var_names.str.endswith(\"1\")]\n",
- "logdata"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f1a77680",
- "metadata": {},
- "source": [
- "# Adding observatons and variables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "70952826",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AnnData object with n_obs × n_vars = 31 × 200\n",
- " obs: 'cell_type'\n",
- " uns: '_omicslog'\n",
- " obsm: 'X_umap'\n",
- " varm: 'gene_stuff'\n",
- "\n",
- "Operation log:\n",
- "[2026-03-27 16:38:57] obs: 'cell_type' added\n",
- "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n",
- "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining\n",
- "[2026-03-27 16:38:57] obsm: 'X_umap' added\n",
- "[2026-03-27 16:38:57] varm: 'gene_stuff' added"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "logdata.obsm[\"X_umap\"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))\n",
- "logdata.varm[\"gene_stuff\"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))\n",
- "logdata"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d9a5c3b4",
- "metadata": {},
- "source": [
- "# Adding layers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "529860ba",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AnnData object with n_obs × n_vars = 31 × 200\n",
- " obs: 'cell_type'\n",
- " uns: '_omicslog'\n",
- " obsm: 'X_umap'\n",
- " varm: 'gene_stuff'\n",
- " layers: 'log_transformed'\n",
- "\n",
- "Operation log:\n",
- "[2026-03-27 16:38:57] obs: 'cell_type' added\n",
- "[2026-03-27 16:38:57] subset: removed 69 samples (69%), 31 samples remaining\n",
- "[2026-03-27 16:38:57] subset: removed 1800 genes (90%), 200 genes remaining\n",
- "[2026-03-27 16:38:57] obsm: 'X_umap' added\n",
- "[2026-03-27 16:38:57] varm: 'gene_stuff' added\n",
- "[2026-03-27 16:38:57] layers: 'log_transformed' added"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "logdata.layers[\"log_transformed\"] = np.log1p(logdata.X)\n",
- "logdata"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": ".venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.14"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/analysis/py/smoke.py b/analysis/py/smoke.py
deleted file mode 100644
index eb17e7b..0000000
--- a/analysis/py/smoke.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import sys
-from pathlib import Path
-
-def main() -> None:
- print("Python:", sys.version)
-
- # Minimal deps: numpy + sklearn (small, common). Add torch as needed.
- import numpy as np # noqa: F401
- from sklearn.linear_model import LogisticRegression
-
- X = np.random.randn(100, 5)
- y = (X[:, 0] + 0.1 * X[:, 1] > 0).astype(int)
-
- clf = LogisticRegression(max_iter=200)
- clf.fit(X, y)
- acc = clf.score(X, y)
-
- out = Path("results")
- out.mkdir(exist_ok=True)
- (out / "smoke_metrics.txt").write_text(f"train_acc={acc:.3f}\n")
-
- print("Python smoke test completed; wrote results/smoke_metrics.txt")
-
-
-if __name__ == "__main__":
- main()
-
diff --git a/analysis/r/smoke.R b/analysis/r/smoke.R
deleted file mode 100644
index fdc5fb0..0000000
--- a/analysis/r/smoke.R
+++ /dev/null
@@ -1,26 +0,0 @@
-# Minimal smoke test for R container
-message("R version: ", R.version.string)
-
-# Ensure renv works
-if (!requireNamespace("renv", quietly = TRUE)) {
- stop("renv not installed")
-}
-
-# Optional: install a tiny plotting stack in project env if not present
-pkgs <- c("ggplot2")
-missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
-if (length(missing) > 0) {
- message("Installing missing packages: ", paste(missing, collapse = ", "))
- install.packages(missing, repos = "https://cloud.r-project.org")
-}
-
-library(ggplot2)
-
-df <- data.frame(x = 1:10, y = (1:10)^2)
-p <- ggplot(df, aes(x, y)) + geom_point() + ggtitle("Smoke test plot")
-
-dir.create("results", showWarnings = FALSE)
-ggsave("results/smoke_plot.png", p, width = 6, height = 4, dpi = 150)
-
-message("R smoke test completed; wrote results/smoke_plot.png")
-
diff --git a/analysis/r/template.qmd b/analysis/r/template.qmd
deleted file mode 100644
index e71273b..0000000
--- a/analysis/r/template.qmd
+++ /dev/null
@@ -1,22 +0,0 @@
----
-title: ""
-author: Juan Henao
-date: '`r format(Sys.time(), "%d %B, %Y")`'
-description: ""
-title-block-banner: "black"
-quarto:
- components:
- panel-tabset:
- max_items: 10
-format:
- html:
- embed-resources: true
- smooth-scroll: true
- anchor-sections: true
- number-sections: true
- toc: true
- toc-location: left
- code-fold: true
- theme: cerulean
-editor: visual
----
diff --git a/demo.ipynb b/demo.ipynb
new file mode 100644
index 0000000..9cf0481
--- /dev/null
+++ b/demo.ipynb
@@ -0,0 +1,585 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "e19092cb",
+ "metadata": {},
+ "source": [
+ "# Omicslog\n",
+ "\n",
+ "## Importing packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "14411cbf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from omicslog import log_start\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import anndata as ad\n",
+ "from scipy.sparse import csr_matrix\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "99bec3f9",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
📝 Note\n",
+ "The AnnData object was generated using code from the original
AnnData documentation.\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "c285a9b4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AnnData object with n_obs × n_vars = 100 × 2000\n",
+ " uns: '_omicslog'\n",
+ "AnnData object with n_obs × n_vars = 100 × 2000\n",
+ " obs: 'cell_type'\n",
+ " uns: '_omicslog'\n",
+ "\n",
+ "Operation log:\n",
+ "[2026-05-20 13:49:41] obs: 'cell_type' added\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " Operation | \n",
+ " Message | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2026-05-20 13:49:41 | \n",
+ " obs | \n",
+ " 'cell_type' added | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time Operation Message\n",
+ "0 2026-05-20 13:49:41 obs 'cell_type' added"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)\n",
+ "adata = ad.AnnData(counts)\n",
+ "\n",
+ "adata.obs_names = [f\"Cell_{i:d}\" for i in range(adata.n_obs)]\n",
+ "adata.var_names = [f\"Gene_{i:d}\" for i in range(adata.n_vars)]\n",
+ "\n",
+ "logdata = log_start(adata)\n",
+ "print(logdata)\n",
+ "\n",
+ "ct = np.random.choice([\"B\", \"T\", \"Monocyte\"], size=(logdata.n_obs,))\n",
+ "logdata.obs[\"cell_type\"] = pd.Categorical(ct) # Categoricals are preferred for efficiency\n",
+ "print(logdata)\n",
+ "logdata.uns[\"_omicslog\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b88c830d",
+ "metadata": {},
+ "source": [
+ "## Filtering by Cells (.obs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "b88b1e3a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AnnData object with n_obs × n_vars = 27 × 2000\n",
+ " obs: 'cell_type'\n",
+ " uns: '_omicslog'\n",
+ "\n",
+ "Operation log:\n",
+ "[2026-05-20 13:49:41] obs: 'cell_type' added\n",
+ "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " Operation | \n",
+ " Message | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2026-05-20 13:49:41 | \n",
+ " obs | \n",
+ " 'cell_type' added | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2026-05-20 13:49:43 | \n",
+ " subset | \n",
+ " removed 73 samples (73%), 27 samples remaining | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time Operation \\\n",
+ "0 2026-05-20 13:49:41 obs \n",
+ "1 2026-05-20 13:49:43 subset \n",
+ "\n",
+ " Message \n",
+ "0 'cell_type' added \n",
+ "1 removed 73 samples (73%), 27 samples remaining "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "logdata = logdata[logdata.obs.cell_type == \"B\"]\n",
+ "print(logdata)\n",
+ "logdata.uns[\"_omicslog\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6bcdb40a",
+ "metadata": {},
+ "source": [
+ "## Filtering by Genes (.var)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "8f7ed85f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AnnData object with n_obs × n_vars = 27 × 200\n",
+ " obs: 'cell_type'\n",
+ " uns: '_omicslog'\n",
+ "\n",
+ "Operation log:\n",
+ "[2026-05-20 13:49:41] obs: 'cell_type' added\n",
+ "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n",
+ "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " Operation | \n",
+ " Message | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2026-05-20 13:49:41 | \n",
+ " obs | \n",
+ " 'cell_type' added | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2026-05-20 13:49:43 | \n",
+ " subset | \n",
+ " removed 73 samples (73%), 27 samples remaining | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2026-05-20 13:49:46 | \n",
+ " subset | \n",
+ " removed 1800 genes (90%), 200 genes remaining | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time Operation \\\n",
+ "0 2026-05-20 13:49:41 obs \n",
+ "1 2026-05-20 13:49:43 subset \n",
+ "2 2026-05-20 13:49:46 subset \n",
+ "\n",
+ " Message \n",
+ "0 'cell_type' added \n",
+ "1 removed 73 samples (73%), 27 samples remaining \n",
+ "2 removed 1800 genes (90%), 200 genes remaining "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "logdata = logdata[:,logdata.var_names.str.endswith(\"1\")]\n",
+ "print(logdata)\n",
+ "logdata.uns[\"_omicslog\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a679a08e",
+ "metadata": {},
+ "source": [
+ "## Adding observations and variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "0fb2c8b9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AnnData object with n_obs × n_vars = 27 × 200\n",
+ " obs: 'cell_type'\n",
+ " uns: '_omicslog'\n",
+ " obsm: 'X_umap'\n",
+ " varm: 'gene_stuff'\n",
+ "\n",
+ "Operation log:\n",
+ "[2026-05-20 13:49:41] obs: 'cell_type' added\n",
+ "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n",
+ "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n",
+ "[2026-05-20 13:49:48] obsm: 'X_umap' added\n",
+ "[2026-05-20 13:49:48] varm: 'gene_stuff' added\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " Operation | \n",
+ " Message | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2026-05-20 13:49:41 | \n",
+ " obs | \n",
+ " 'cell_type' added | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2026-05-20 13:49:43 | \n",
+ " subset | \n",
+ " removed 73 samples (73%), 27 samples remaining | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2026-05-20 13:49:46 | \n",
+ " subset | \n",
+ " removed 1800 genes (90%), 200 genes remaining | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2026-05-20 13:49:48 | \n",
+ " obsm | \n",
+ " 'X_umap' added | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2026-05-20 13:49:48 | \n",
+ " varm | \n",
+ " 'gene_stuff' added | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time Operation \\\n",
+ "0 2026-05-20 13:49:41 obs \n",
+ "1 2026-05-20 13:49:43 subset \n",
+ "2 2026-05-20 13:49:46 subset \n",
+ "3 2026-05-20 13:49:48 obsm \n",
+ "4 2026-05-20 13:49:48 varm \n",
+ "\n",
+ " Message \n",
+ "0 'cell_type' added \n",
+ "1 removed 73 samples (73%), 27 samples remaining \n",
+ "2 removed 1800 genes (90%), 200 genes remaining \n",
+ "3 'X_umap' added \n",
+ "4 'gene_stuff' added "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "logdata.obsm[\"X_umap\"] = np.random.normal(0, 1, size=(logdata.n_obs, 2))\n",
+ "logdata.varm[\"gene_stuff\"] = np.random.normal(0, 1, size=(logdata.n_vars, 5))\n",
+ "print(logdata)\n",
+ "logdata.uns[\"_omicslog\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2021a54b",
+ "metadata": {},
+ "source": [
+ "## Adding layers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "a8c41222",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "AnnData object with n_obs × n_vars = 27 × 200\n",
+ " obs: 'cell_type'\n",
+ " uns: '_omicslog'\n",
+ " obsm: 'X_umap'\n",
+ " varm: 'gene_stuff'\n",
+ " layers: 'log_transformed'\n",
+ "\n",
+ "Operation log:\n",
+ "[2026-05-20 13:49:41] obs: 'cell_type' added\n",
+ "[2026-05-20 13:49:43] subset: removed 73 samples (73%), 27 samples remaining\n",
+ "[2026-05-20 13:49:46] subset: removed 1800 genes (90%), 200 genes remaining\n",
+ "[2026-05-20 13:49:48] obsm: 'X_umap' added\n",
+ "[2026-05-20 13:49:48] varm: 'gene_stuff' added\n",
+ "[2026-05-20 13:49:50] layers: 'log_transformed' added\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Time | \n",
+ " Operation | \n",
+ " Message | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2026-05-20 13:49:41 | \n",
+ " obs | \n",
+ " 'cell_type' added | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2026-05-20 13:49:43 | \n",
+ " subset | \n",
+ " removed 73 samples (73%), 27 samples remaining | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2026-05-20 13:49:46 | \n",
+ " subset | \n",
+ " removed 1800 genes (90%), 200 genes remaining | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2026-05-20 13:49:48 | \n",
+ " obsm | \n",
+ " 'X_umap' added | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2026-05-20 13:49:48 | \n",
+ " varm | \n",
+ " 'gene_stuff' added | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2026-05-20 13:49:50 | \n",
+ " layers | \n",
+ " 'log_transformed' added | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Time Operation \\\n",
+ "0 2026-05-20 13:49:41 obs \n",
+ "1 2026-05-20 13:49:43 subset \n",
+ "2 2026-05-20 13:49:46 subset \n",
+ "3 2026-05-20 13:49:48 obsm \n",
+ "4 2026-05-20 13:49:48 varm \n",
+ "5 2026-05-20 13:49:50 layers \n",
+ "\n",
+ " Message \n",
+ "0 'cell_type' added \n",
+ "1 removed 73 samples (73%), 27 samples remaining \n",
+ "2 removed 1800 genes (90%), 200 genes remaining \n",
+ "3 'X_umap' added \n",
+ "4 'gene_stuff' added \n",
+ "5 'log_transformed' added "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "logdata.layers[\"log_transformed\"] = np.log1p(logdata.X)\n",
+ "print(logdata)\n",
+ "logdata.uns[\"_omicslog\"]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/make_readme.sh b/make_readme.sh
new file mode 100644
index 0000000..cf971a5
--- /dev/null
+++ b/make_readme.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Regenerate README.md from demo.ipynb with nbconvert, run inside the Poetry environment.
+set -euo pipefail
+
+# Work from the script's own directory so the notebook is found from any cwd.
+cd "$(dirname "$0")"
+poetry run jupyter nbconvert demo.ipynb --to markdown --output README
diff --git a/pyproject.toml b/pyproject.toml
index fdd9235..431a26b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,20 +1,33 @@
[tool.poetry]
-name = "project"
+name = "omicslog"
version = "0.1.0"
-description = "CompBio template: Python (Poetry) + R (renv) in separate devcontainers"
-authors = ["Your Name "]
+description = "AnnData subclass that automatically logs mutations to .uns['_omicslog']"
+authors = ["Stefano Mangiola ", "Juan Henao "]
+maintainers = ["Juan Henao "]
+license = "MIT"
readme = "README.md"
-package-mode = false
+homepage = "https://github.com/tidyomics/omicslog_dev"
+repository = "https://github.com/tidyomics/omicslog_dev"
+keywords = ["bioinformatics", "single-cell", "anndata", "logging"]
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: MIT License",
+ "Intended Audience :: Science/Research",
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+packages = [{include = "omicslog", from = "src"}]
[tool.poetry.dependencies]
python = ">=3.11,<3.12"
+anndata = ">=0.10"
numpy = "^2.0.0"
-scikit-learn = "^1.5.0"
[tool.poetry.group.dev.dependencies]
ruff = "^0.6.0"
black = "^24.8.0"
ipykernel = "^7.2.0"
+pytest = "^8.0.0"
+scipy = "^1.14.0"
[build-system]
requires = ["poetry-core"]
@@ -25,4 +38,3 @@ line-length = 100
[tool.black]
line-length = 100
-
diff --git a/src/omicslog/__init__.py b/src/omicslog/__init__.py
new file mode 100644
index 0000000..71c626b
--- /dev/null
+++ b/src/omicslog/__init__.py
@@ -0,0 +1,7 @@
+from omicslog.core import (
+ LoggedAnnDataStandalone,
+ AnnDataSnapshot,
+ log_start,
+)
+
+__all__ = ["LoggedAnnDataStandalone", "AnnDataSnapshot", "log_start"]
diff --git a/src/omicslog/core.py b/src/omicslog/core.py
new file mode 100644
index 0000000..40db784
--- /dev/null
+++ b/src/omicslog/core.py
@@ -0,0 +1,433 @@
+from __future__ import annotations
+
+import warnings
+from copy import deepcopy
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Sequence
+
+import anndata as ad
+import pandas as pd
+
+# Key in ``AnnData.uns`` under which the operation log table is stored.
+LOG_KEY = "_omicslog"
+# Column layout of the log table: one [time, operation, message] row per event.
+LOG_COLUMNS = ["Time", "Operation", "Message"]
+
+
+def _safe_deepcopy_dict(d: dict) -> dict:
+    """Deep-copy every value of ``d`` that can be copied.
+
+    Best effort: a value whose ``deepcopy`` raises is left out of the result,
+    and a warning names the skipped key so the omission is not silent.
+    """
+    result = {}
+    for k, v in d.items():
+        try:
+            result[k] = deepcopy(v)
+        except Exception as exc:
+            warnings.warn(
+                f"omicslog: could not copy entry {k!r} ({type(exc).__name__}); skipped",
+                stacklevel=2,
+            )
+    return result
+
+
+def _timestamp() -> str:
+    """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
+    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+
+def _format_log_message(operation: str, message: str, ts: str | None = None) -> list[str]:
+    """Build one log row ``[time, operation, message]``.
+
+    Pass ``ts`` to make several rows share a timestamp; when it is omitted
+    (or empty) the current time is used.
+    """
+    stamp = ts or _timestamp()
+    return [stamp, operation, message]
+
+
+def _ensure_log_container(adata: ad.AnnData) -> pd.DataFrame:
+    """Return the log table of ``adata``, creating an empty one if needed.
+
+    Anything stored under ``LOG_KEY`` that is not a DataFrame is replaced.
+    """
+    current = adata.uns.get(LOG_KEY)
+    if not isinstance(current, pd.DataFrame):
+        adata.uns[LOG_KEY] = pd.DataFrame(columns=LOG_COLUMNS)
+    return adata.uns[LOG_KEY]
+
+
+def _append_log_messages(adata: ad.AnnData, messages: Sequence[Sequence[str]]) -> None:
+    """Append log rows to the log table of ``adata``.
+
+    ``messages`` holds ``[time, operation, message]`` rows as built by
+    ``_format_log_message``; an empty sequence is a no-op. The stored table is
+    replaced by a new DataFrame rather than modified in place.
+    """
+    if not messages:
+        return
+    container = _ensure_log_container(adata)
+    new_rows = pd.DataFrame(messages, columns=LOG_COLUMNS)
+    adata.uns[LOG_KEY] = pd.concat([container, new_rows], ignore_index=True)
+
+
+def _inherit_and_append(
+    parent: ad.AnnData,
+    child: ad.AnnData,
+    messages: Sequence[Sequence[str]],
+) -> None:
+    """Give ``child`` a copy of the log of ``parent``, then append ``messages``."""
+    # Copy so parent and child never share one DataFrame, even with no new rows.
+    child.uns[LOG_KEY] = _ensure_log_container(parent).copy()
+    _append_log_messages(child, messages)
+
+
+def _parent_set(obj, attr: str, value) -> None:
+    """Set ``attr`` on ``obj`` through the first parent class that defines it.
+
+    Skips ``type(obj)`` itself, then calls ``__set__`` on the first class-level
+    entry named ``attr`` found along the MRO (plain properties and custom
+    descriptors both expose ``__set__``). Falls back to ``object.__setattr__``
+    when no parent class defines ``attr``.
+    """
+    for base in type(obj).__mro__[1:]:
+        if attr in base.__dict__:
+            base.__dict__[attr].__set__(obj, value)
+            return
+    object.__setattr__(obj, attr, value)
+
+
+class _LoggingProxy:
+    """Proxy for a container of a logged AnnData (layers, obsm, obs, ...).
+
+    Item assignment and deletion are forwarded to the wrapped object and
+    recorded in the owner's log. Item reads, attribute access, attribute
+    assignment, ``in``, ``iter``, ``len`` and ``repr`` are forwarded unchanged.
+    """
+
+    def __init__(self, wrapped, owner: "LoggedAnnDataStandalone", label: str):
+        # object.__setattr__ keeps these on the proxy itself; ordinary attribute
+        # assignment is forwarded to the wrapped object (see __setattr__).
+        object.__setattr__(self, "_w", wrapped)
+        object.__setattr__(self, "_owner", owner)
+        object.__setattr__(self, "_label", label)
+
+    def __setitem__(self, key: str, value) -> None:
+        w = object.__getattribute__(self, "_w")
+        owner = object.__getattribute__(self, "_owner")
+        label = object.__getattribute__(self, "_label")
+        # Pick the verb before writing, while the old state is still visible.
+        verb = "updated" if key in w else "added"
+        w[key] = value
+        _append_log_messages(owner, [_format_log_message(label, f"'{key}' {verb}")])
+
+    def __delitem__(self, key: str) -> None:
+        w = object.__getattribute__(self, "_w")
+        owner = object.__getattribute__(self, "_owner")
+        label = object.__getattribute__(self, "_label")
+        del w[key]
+        _append_log_messages(owner, [_format_log_message(label, f"'{key}' removed")])
+
+    def __getitem__(self, key):
+        return object.__getattribute__(self, "_w")[key]
+
+    def __getattr__(self, name):
+        # Only reached when normal lookup on the proxy itself fails.
+        return getattr(object.__getattribute__(self, "_w"), name)
+
+    def __setattr__(self, name, value) -> None:
+        # A new proxy is built on every property access, so a value stored on the
+        # proxy would be lost; forward it to the wrapped object instead
+        # (e.g. ``adata.obs.index = new_index``).
+        setattr(object.__getattribute__(self, "_w"), name, value)
+
+    def __contains__(self, key):
+        return key in object.__getattribute__(self, "_w")
+
+    def __iter__(self):
+        return iter(object.__getattribute__(self, "_w"))
+
+    def __len__(self):
+        return len(object.__getattribute__(self, "_w"))
+
+    def __repr__(self):
+        return repr(object.__getattribute__(self, "_w"))
+
+
+@dataclass
+class AnnDataSnapshot:
+    """Dimensions and key/column names of an AnnData object, used for diffing."""
+
+    n_obs: int
+    n_vars: int
+    obs_cols: list[str] = field(default_factory=list)
+    var_cols: list[str] = field(default_factory=list)
+    layers: list[str] = field(default_factory=list)
+    obsm: list[str] = field(default_factory=list)
+    varm: list[str] = field(default_factory=list)
+    obsp: list[str] = field(default_factory=list)
+    varp: list[str] = field(default_factory=list)
+
+    @classmethod
+    def from_anndata(cls, adata: ad.AnnData) -> "AnnDataSnapshot":
+        """Record the current shape and key names of ``adata``."""
+        return cls(
+            n_obs=adata.n_obs,
+            n_vars=adata.n_vars,
+            obs_cols=list(adata.obs.columns),
+            var_cols=list(adata.var.columns),
+            layers=list(adata.layers.keys()),
+            obsm=list(adata.obsm.keys()),
+            varm=list(adata.varm.keys()),
+            obsp=list(adata.obsp.keys()),
+            varp=list(adata.varp.keys()),
+        )
+
+
+def _diff_keys(
+    pre: list[str],
+    post: list[str],
+    label: str,
+    operation: str,
+    ts: str,
+) -> list[list[str]]:
+    """Return log rows for keys present in only one of ``pre`` and ``post``.
+
+    Removed keys come first, then added keys, each group in sorted order.
+    """
+    msgs = []
+    for k in sorted(set(pre) - set(post)):
+        msgs.append(_format_log_message(operation, f"{label} removed: '{k}'", ts))
+    for k in sorted(set(post) - set(pre)):
+        msgs.append(_format_log_message(operation, f"{label} added: '{k}'", ts))
+    return msgs
+
+
+def _removal_row(before: int, after: int, noun: str, operation: str, ts: str) -> list[str]:
+    """Build the log row reporting how many ``noun`` were removed along one axis."""
+    removed = before - after
+    # Guard the percentage against an axis that was already empty.
+    pct = round((removed / before) * 100) if before else 0
+    return _format_log_message(
+        operation,
+        f"removed {removed} {noun} ({pct}%), {after} {noun} remaining",
+        ts,
+    )
+
+
+def _subset_messages(
+    pre: AnnDataSnapshot,
+    post: AnnDataSnapshot,
+    operation: str = "subset",
+) -> list[list[str]]:
+    """Return the log rows describing the change from ``pre`` to ``post``.
+
+    Reports a change in the number of variables ("genes") and observations
+    ("samples"), then every key added to or removed from the obs/var columns,
+    layers, obsm, varm, obsp and varp. All rows share one timestamp.
+    """
+    msgs: list[list[str]] = []
+    ts = _timestamp()
+
+    if pre.n_vars != post.n_vars:
+        msgs.append(_removal_row(pre.n_vars, post.n_vars, "genes", operation, ts))
+    if pre.n_obs != post.n_obs:
+        msgs.append(_removal_row(pre.n_obs, post.n_obs, "samples", operation, ts))
+
+    for label, before, after in (
+        ("obs column", pre.obs_cols, post.obs_cols),
+        ("var column", pre.var_cols, post.var_cols),
+        ("layer", pre.layers, post.layers),
+        ("obsm", pre.obsm, post.obsm),
+        ("varm", pre.varm, post.varm),
+        ("obsp", pre.obsp, post.obsp),
+        ("varp", pre.varp, post.varp),
+    ):
+        msgs += _diff_keys(before, after, label, operation, ts)
+
+    return msgs
+
+
+class LoggedAnnDataStandalone(ad.AnnData):
+    """AnnData subclass that records mutations in ``.uns[LOG_KEY]``.
+
+    Item assignment and deletion on obs, var, layers, obsm, varm, obsp and
+    varp, as well as subsetting, add rows to the log table; the log is also
+    appended to the object's ``repr``.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        super().__init__(*args, **kwargs)
+        _ensure_log_container(self)
+
+    @classmethod
+    def _safe_component_copy(cls, value):
+        """Copy ``value`` with its own ``copy()`` when it has one, else ``deepcopy``."""
+        return value.copy() if hasattr(value, "copy") else deepcopy(value)
+
+    @classmethod
+    def from_anndata(cls, adata: ad.AnnData) -> "LoggedAnnDataStandalone":
+        """Return a logged version of ``adata``.
+
+        An object that is already an instance of ``cls`` is returned as is,
+        after making sure it has a log table. Anything else is rebuilt from
+        copies of its components (X, obs, var, uns, obsm, varm, layers, obsp,
+        varp and, when present, raw).
+        """
+        if isinstance(adata, cls):
+            _ensure_log_container(adata)
+            return adata
+
+        kwargs: dict[str, Any] = {
+            "X": cls._safe_component_copy(adata.X) if adata.X is not None else None,
+            "obs": adata.obs.copy(),
+            "var": adata.var.copy(),
+            "uns": _safe_deepcopy_dict(dict(adata.uns)),
+            "obsm": {k: cls._safe_component_copy(v) for k, v in adata.obsm.items()},
+            "varm": {k: cls._safe_component_copy(v) for k, v in adata.varm.items()},
+            "layers": {k: cls._safe_component_copy(v) for k, v in adata.layers.items()},
+            "obsp": {k: cls._safe_component_copy(v) for k, v in adata.obsp.items()},
+            "varp": {k: cls._safe_component_copy(v) for k, v in adata.varp.items()},
+        }
+
+        if adata.raw is not None:
+            kwargs["raw"] = {
+                "X": cls._safe_component_copy(adata.raw.X),
+                "var": adata.raw.var.copy(),
+                "varm": {k: cls._safe_component_copy(v) for k, v in adata.raw.varm.items()},
+            }
+
+        logged = cls(**kwargs)
+        _ensure_log_container(logged)
+        return logged
+
+    # --- proxied properties: each needs a getter AND a setter ---
+    # Getters wrap the parent's container in a _LoggingProxy; setters hand
+    # whole-container assignment to the parent class through _parent_set.
+
+    @property
+    def layers(self):
+        return _LoggingProxy(super().layers, self, "layers")
+
+    @layers.setter
+    def layers(self, value):
+        _parent_set(self, "layers", value)
+
+    @property
+    def obsm(self):
+        return _LoggingProxy(super().obsm, self, "obsm")
+
+    @obsm.setter
+    def obsm(self, value):
+        _parent_set(self, "obsm", value)
+
+    @property
+    def varm(self):
+        return _LoggingProxy(super().varm, self, "varm")
+
+    @varm.setter
+    def varm(self, value):
+        _parent_set(self, "varm", value)
+
+    @property
+    def obsp(self):
+        return _LoggingProxy(super().obsp, self, "obsp")
+
+    @obsp.setter
+    def obsp(self, value):
+        _parent_set(self, "obsp", value)
+
+    @property
+    def varp(self):
+        return _LoggingProxy(super().varp, self, "varp")
+
+    @varp.setter
+    def varp(self, value):
+        _parent_set(self, "varp", value)
+
+    @property
+    def obs(self):
+        return _LoggingProxy(super().obs, self, "obs")
+
+    @obs.setter
+    def obs(self, value):
+        _parent_set(self, "obs", value)
+
+    @property
+    def var(self):
+        return _LoggingProxy(super().var, self, "var")
+
+    @var.setter
+    def var(self, value):
+        # Same route as the other setters (was a direct ``ad.AnnData.var.fset``).
+        _parent_set(self, "var", value)
+
+    # --- snapshot & subsetting ---
+
+    def _snapshot(self) -> AnnDataSnapshot:
+        return AnnDataSnapshot.from_anndata(self)
+
+    def __getitem__(self, index):
+        """Subset like the parent class and return a logged result.
+
+        The result starts from a copy of this object's log and gains one row
+        per difference between the snapshots taken before and after.
+        """
+        pre = self._snapshot()
+        result = super().__getitem__(index)
+        logged_result = self.from_anndata(result)
+        msgs = _subset_messages(pre, logged_result._snapshot(), operation="subset")
+        _inherit_and_append(self, logged_result, msgs)
+        return logged_result
+
+    def _inplace_subset(self, index):
+        """Run the parent's in-place subset and log what changed."""
+        pre = self._snapshot()
+        super()._inplace_subset(index)
+        _append_log_messages(self, _subset_messages(pre, self._snapshot(), operation="subset"))
+
+    def _log_lines(self) -> list[str]:
+        """Return the log as one ``[time] operation: message`` string per row.
+
+        A log stored as something other than a DataFrame is rendered with
+        ``str()`` per entry; a missing or empty log gives an empty list.
+        """
+        logs = self.uns.get(LOG_KEY, [])
+        if isinstance(logs, pd.DataFrame):
+            if logs.empty:
+                return []
+            return [
+                f"[{time}] {operation}: {message}"
+                for time, operation, message in zip(
+                    logs["Time"], logs["Operation"], logs["Message"]
+                )
+            ]
+        if not logs:
+            return []
+        return [str(x) for x in logs]
+
+    def _operation_log_block(self) -> str:
+        """Return the text appended to ``repr``; empty when nothing is logged."""
+        lines = self._log_lines()
+        if not lines:
+            return ""
+        return "\n\nOperation log:\n" + "\n".join(lines)
+
+    def __repr__(self) -> str:
+        return super().__repr__() + self._operation_log_block()
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+    def operation_log(self) -> list[str]:
+        """Return the logged operations, one formatted string per log row."""
+        # ``list(DataFrame)`` yields the column names, not the rows, so the
+        # rows are formatted explicitly.
+        return self._log_lines()
+
+
+def log_start(adata: ad.AnnData) -> LoggedAnnDataStandalone:
+    """Start logging: return ``adata`` as a ``LoggedAnnDataStandalone``."""
+    return LoggedAnnDataStandalone.from_anndata(adata)
\ No newline at end of file
diff --git a/renv.lock b/tests/__init__.py
similarity index 100%
rename from renv.lock
rename to tests/__init__.py