diff --git a/app.py b/app.py index d9b3d73d..cec8d4e9 100644 --- a/app.py +++ b/app.py @@ -8,33 +8,35 @@ with open("settings.json", "r") as f: st.session_state.settings = json.load(f) +# Initialize session state for workspace +if "chosen-workspace" not in st.session_state: + if "workspace" in st.session_state: + st.session_state["chosen-workspace"] = str(st.session_state.workspace.stem) + else: + st.session_state["chosen-workspace"] = "default" + if __name__ == '__main__': pages = { str(st.session_state.settings["app-name"]) : [ st.Page(Path("content", "quickstart.py"), title="Quickstart", icon="πŸ‘‹"), st.Page(Path("content", "documentation.py"), title="Documentation", icon="πŸ“–"), ], - "pyOpenMS Toolbox": [ - st.Page(Path("content", "digest.py"), title="In Silico Digest", icon="βœ‚οΈ"), - st.Page(Path("content", "peptide_mz_calculator.py"), title="m/z Calculator", icon="βš–οΈ"), - st.Page(Path("content", "isotope_pattern_generator.py"), title="Isotopic Pattern Calculator", icon="πŸ“Ά"), - st.Page(Path("content", "fragmentation.py"), title="Fragment Ion Generation", icon="πŸ’₯"), - ], "TOPP Workflow Framework": [ st.Page(Path("content", "topp_workflow_file_upload.py"), title="File Upload", icon="πŸ“"), st.Page(Path("content", "topp_workflow_parameter.py"), title="Configure", icon="βš™οΈ"), st.Page(Path("content", "topp_workflow_execution.py"), title="Run", icon="πŸš€"), - st.Page(Path("content", "topp_workflow_results.py"), title="Results", icon="πŸ“Š"), - ], - "pyOpenMS Workflow" : [ - st.Page(Path("content", "file_upload.py"), title="File Upload", icon="πŸ“‚"), - st.Page(Path("content", "raw_data_viewer.py"), title="View MS data", icon="πŸ‘€"), - st.Page(Path("content", "run_example_workflow.py"), title="Run Workflow", icon="βš™οΈ"), - st.Page(Path("content", "download_section.py"), title="Download Results", icon="⬇️"), + # st.Page(Path("content", "topp_workflow_results.py"), title="Results", icon="πŸ“Š"), ], - "Others Topics": [ - st.Page(Path("content", "simple_workflow.py"), title="Simple Workflow", icon="βš™οΈ"), - st.Page(Path("content", "run_subprocess.py"), title="Run Subprocess", icon="πŸ–₯️"), + "Results": [ + st.Page(Path("content", "results_database_search.py"), title="Database Search", icon="πŸ”¬"), + st.Page(Path("content", "results_rescoring.py"), title="Rescoring", icon="πŸ“ˆ"), + st.Page(Path("content", "results_filtered.py"), title="Filtered PSMs", icon="🎯"), + st.Page(Path("content", "results_abundance.py"), title="Abundance", icon="πŸ“‹"), + st.Page(Path("content", "results_volcano.py"), title="Volcano", icon="πŸŒ‹"), + st.Page(Path("content", "results_pca.py"), title="PCA", icon="πŸ“Š"), + st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="πŸ”₯"), + # st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="πŸ“š"), + st.Page(Path("content", "results_pathway.py"), title="Pathway Analysis", icon="πŸ§ͺ"), ] } diff --git a/content/results_abundance.py b/content/results_abundance.py new file mode 100644 index 00000000..baf5cb9f --- /dev/null +++ b/content/results_abundance.py @@ -0,0 +1,132 @@ +"""Abundance (ProteomicsLFQ) Results Page.""" +import streamlit as st +import pandas as pd +import numpy as np +from pathlib import Path +from scipy.stats import ttest_ind +from src.common.common import page_setup +from statsmodels.stats.multitest import multipletests +from src.common.results_helpers import get_workflow_dir, get_abundance_data + +params = page_setup() +st.title("Abundance Quantification") + +st.markdown( + """ +View protein and PSM-level quantification from **ProteomicsLFQ**. +This page calculates differential expression statistics between sample groups. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +results_dir = Path(st.session_state["workspace"]) / "topp-workflow" / "results" / "quant_results" +consensus_out = results_dir / "openms_design_protein_openms.csv" + +@st.cache_data +def load_data(file_path): + return pd.read_csv(file_path, sep="\t", comment="#", engine="python") + +if consensus_out.exists(): + + # df = load_data(consensus_out) + # # ratio column removal + # df = df.loc[:, ~df.columns.str.contains('ratio', case=False)] + + pre_processing_tab, protein_tab = st.tabs(["Pre-processing", "Protein Table"]) + + with pre_processing_tab: + # result = get_abundance_data(st.session_state["workspace"]) + # DEBUG: 상세 원인 좜λ ₯ (μž„μ‹œ) + try: + result = get_abundance_data(st.session_state["workspace"]) + except Exception as e: + st.exception(e) + result = None + + if result is None: + ws = st.session_state.get("workspace") + st.error("Debug: get_abundance_data returned None") + st.write("workspace:", ws) + wf = Path(ws) / "topp-workflow" + st.write("workflow dir exists:", wf.exists(), "->", wf) + qdir = wf / "results" / "quant_results" + st.write("quant_dir exists:", qdir.exists(), "->", qdir) + if qdir.exists(): + st.write("csv files:", sorted([p.name for p in qdir.glob('*.csv')])) + # show cached param snapshot if available + try: + from src.workflow.ParameterManager import ParameterManager + pm = ParameterManager(wf) + st.write("parameters keys (sample):", list(pm.get_parameters_from_json().items())[:20]) + except Exception as e: + st.write("Param manager error:", e) + st.stop() + + if result is None: + st.info("πŸ’‘ Please complete the configuration in the 'Configure' page to see results.") + st.stop() + + pivot_df, expr_df, group_map = result + + st.write("### Final Results (Group row removed, Stats added)") + st.dataframe(pivot_df.head(10)) + + + with protein_tab: + st.markdown("### Protein-Level Abundance Table") + + st.info( + "This protein-level table is generated by grouping all PSMs that map to the " + "same protein and aggregating their intensities across samples.\n\n" + "Additionally, log2 fold change and p-values are calculated between sample groups." + ) + + # Display group comparison info + groups = sorted(set(group_map.values())) + if len(groups) >= 2: + group1, group2 = sorted(groups)[:2] + st.info(f"Statistical comparison: **{group2} vs {group1}**") + + exclude_cols = ["protein", "log2FC", "p-value", "p-adj", + "n_proteins", "n_peptides", "protein_score"] + + # Get sample columns (between stats and PeptideSequence) + sample_cols = [c for c in pivot_df.columns if c + not in exclude_cols and "ratio" not in c.lower()] + + # Create bar chart column with log2-transformed values + pivot_df["Intensity"] = pivot_df[sample_cols].apply( + lambda row: [np.log2(v + 1) for v in row], axis=1 + ) + + # Reorder columns: place Intensity after p-value + display_cols = ["protein", "log2FC", "p-value", "Intensity"] + sample_cols + available_cols = [c for c in display_cols if c in pivot_df.columns] + + st.dataframe( + pivot_df[available_cols].sort_values("p-value"), + column_config={ + "Intensity": st.column_config.BarChartColumn( + "Intensity", + help="Sample intensities (log2 scale)", + width="small", + y_min=0, + ), + }, + width="stretch" + ) +else: + st.warning(f"File not found: {consensus_out}") + +st.markdown("---") +st.markdown("**Next steps:** Explore statistical visualizations") +col1, col2, col3 = st.columns(3) +with col1: + st.page_link("content/results_volcano.py", label="Volcano Plot", icon="πŸŒ‹") +with col2: + st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") +with col3: + st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") diff --git a/content/results_database_search.py b/content/results_database_search.py new file mode 100644 index 00000000..cd04e69e --- /dev/null +++ b/content/results_database_search.py @@ -0,0 +1,79 @@ +"""Database Search (Comet) Results Page.""" +import streamlit as st +from pathlib import Path +from src.common.common import page_setup +from src.common.results_helpers import get_workflow_dir +from openms_insight import Table, Heatmap, LinePlot, SequenceView, StateManager + +params = page_setup() +st.title("Database Search Results") + +st.markdown( + """ +View peptide-spectrum matches (PSMs) identified by **Comet** database search. +Click on a PSM to view the annotated spectrum and peptide sequence. +""" +) + +st.info( + "**Score:** The e-value (expectation value) represents the expected number of random PSMs " + "with an equal or better score. Lower values indicate higher confidence identifications." +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +workflow_dir = get_workflow_dir(st.session_state["workspace"]) +comet_dir = workflow_dir / "results" / "comet_results" +cache_dir = workflow_dir / "results" / "insight_cache" + +if not comet_dir.exists(): + st.info("No database search results available yet. Please run the workflow first.") + st.page_link("content/workflow_run.py", label="Go to Run", icon="πŸš€") + st.stop() + +comet_files = sorted(comet_dir.glob("*.idXML")) + +if not comet_files: + st.warning("No identification output files found.") + st.stop() + +selected_file = st.selectbox( + "Select identification result file", + comet_files, + format_func=lambda x: x.name +) + +cache_id_prefix = selected_file.stem + +# Check if cache exists +if not (cache_dir / f"table_{cache_id_prefix}").is_dir(): + st.warning("Visualization cache not found. Please re-run the workflow.") + st.stop() + +# Initialize state manager for cross-component linking +state_manager = StateManager() + +# Load components from cache (no data parameter needed) +table = Table(cache_id=f"table_{cache_id_prefix}", cache_path=str(cache_dir)) +heatmap = Heatmap(cache_id=f"heatmap_{cache_id_prefix}", cache_path=str(cache_dir)) +seq_view = SequenceView(cache_id=f"seqview_{cache_id_prefix}", cache_path=str(cache_dir)) +line_plot = LinePlot(cache_id=f"lineplot_{cache_id_prefix}", cache_path=str(cache_dir)) + +# Render components +st.subheader("PSM Overview") +heatmap(state_manager=state_manager, height=350) + +st.subheader("PSM Table") +table(state_manager=state_manager, height=533) + +st.subheader("Peptide Sequence") +seq_view(key=f"seqview_{cache_id_prefix}", state_manager=state_manager, height=533) + +st.subheader("MS2 Spectrum") +line_plot(key=f"lineplot_{cache_id_prefix}", state_manager=state_manager, height=450, sequence_view_key=f"seqview_{cache_id_prefix}") + +st.markdown("---") +st.markdown("**Next step:** View rescoring results") +st.page_link("content/results_rescoring.py", label="Go to Rescoring", icon="πŸ“ˆ") diff --git a/content/results_filtered.py b/content/results_filtered.py new file mode 100644 index 00000000..a108ebde --- /dev/null +++ b/content/results_filtered.py @@ -0,0 +1,79 @@ +"""Filtered PSMs Results Page.""" +import streamlit as st +from pathlib import Path +from src.common.common import page_setup +from src.common.results_helpers import get_workflow_dir +from openms_insight import Table, Heatmap, LinePlot, SequenceView, StateManager + +params = page_setup() +st.title("Filtered PSMs") + +st.markdown( + """ +View FDR-controlled peptide identifications after **IDFilter** processing. +These are high-confidence PSMs that passed the specified FDR threshold. +Click on a PSM to view the annotated spectrum and peptide sequence. +""" +) + +st.info( + "**Score:** The q-value represents the minimum false discovery rate (FDR) at which this PSM " + "would be accepted. Lower values indicate higher confidence identifications." +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +workflow_dir = get_workflow_dir(st.session_state["workspace"]) +filter_dir = workflow_dir / "results" / "psm_filter" +cache_dir = workflow_dir / "results" / "insight_cache" + +if not filter_dir.exists(): + st.info("No filtered results available yet. Please run the workflow first.") + st.stop() + +filter_files = sorted(filter_dir.glob("*.idXML")) + +if not filter_files: + st.warning("No filtering output files found.") + st.stop() + +selected_file = st.selectbox( + "Select filtering result file", + filter_files, + format_func=lambda x: x.name +) + +cache_id_prefix = selected_file.stem + +# Check if cache exists +if not (cache_dir / f"table_{cache_id_prefix}").is_dir(): + st.warning("Visualization cache not found. Please re-run the workflow.") + st.stop() + +# Initialize state manager for cross-component linking +state_manager = StateManager() + +# Load components from cache (no data parameter needed) +table = Table(cache_id=f"table_{cache_id_prefix}", cache_path=str(cache_dir)) +heatmap = Heatmap(cache_id=f"heatmap_{cache_id_prefix}", cache_path=str(cache_dir)) +seq_view = SequenceView(cache_id=f"seqview_{cache_id_prefix}", cache_path=str(cache_dir)) +line_plot = LinePlot(cache_id=f"lineplot_{cache_id_prefix}", cache_path=str(cache_dir)) + +# Render components +st.subheader("PSM Overview") +heatmap(state_manager=state_manager, height=350) + +st.subheader("PSM Table") +table(state_manager=state_manager, height=533) + +st.subheader("Peptide Sequence") +seq_view(key=f"seqview_{cache_id_prefix}", state_manager=state_manager, height=533) + +st.subheader("MS2 Spectrum") +line_plot(key=f"lineplot_{cache_id_prefix}", state_manager=state_manager, height=450, sequence_view_key=f"seqview_{cache_id_prefix}") + +st.markdown("---") +st.markdown("**Next step:** View abundance quantification") +st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") diff --git a/content/results_heatmap.py b/content/results_heatmap.py new file mode 100644 index 00000000..5a7ba470 --- /dev/null +++ b/content/results_heatmap.py @@ -0,0 +1,76 @@ +"""Heatmap Results Page.""" +import streamlit as st +import numpy as np +import plotly.express as px +from scipy.cluster.hierarchy import linkage, leaves_list +from scipy.spatial.distance import pdist +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + +params = page_setup() +st.title("Heatmap") + +st.markdown( + """ +Hierarchically clustered heatmap of protein-level abundance (Z-score normalized). +Proteins and samples are ordered by similarity. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info("Abundance data not available. Please run the workflow and configure sample groups first.") + st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") + st.stop() + +pivot_df, expr_df, group_map = result + +top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n") + +var_series = expr_df.var(axis=1) +top_proteins = var_series.sort_values(ascending=False).head(top_n).index +heatmap_df = expr_df.loc[top_proteins] +heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0) +heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna() + +if not heatmap_z.empty: + row_linkage = linkage(pdist(heatmap_z.values), method="average") + row_order = leaves_list(row_linkage) + + col_linkage = linkage(pdist(heatmap_z.T.values), method="average") + col_order = leaves_list(col_linkage) + + heatmap_clustered = heatmap_z.iloc[row_order, col_order] + + fig_heatmap = px.imshow( + heatmap_clustered, + labels=dict(x="Sample", y="Protein", color="Z-score"), + aspect="auto", + color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]], + zmin=-3, zmax=3 + ) + + fig_heatmap.update_layout( + height=700, + xaxis={'side': 'bottom'}, + yaxis={'side': 'left'} + ) + + fig_heatmap.update_xaxes(tickfont=dict(size=10)) + fig_heatmap.update_yaxes(tickfont=dict(size=8)) + + st.plotly_chart(fig_heatmap, width="stretch") +else: + st.warning("Insufficient data to generate the heatmap.") + +st.markdown("---") +st.markdown("**Other visualizations:**") +col1, col2 = st.columns(2) +with col1: + st.page_link("content/results_volcano.py", label="Volcano Plot", icon="πŸŒ‹") +with col2: + st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") diff --git a/content/results_pathway.py b/content/results_pathway.py new file mode 100644 index 00000000..e6b13760 --- /dev/null +++ b/content/results_pathway.py @@ -0,0 +1,253 @@ +import mygene +import streamlit as st +import pandas as pd +import numpy as np +import plotly.express as px +from pathlib import Path +from collections import defaultdict +from scipy.stats import fisher_exact +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + + +# ================================ +# Page setup +# ================================ +params = page_setup() +st.title("ProteomicsLFQ Results") + +# ================================ +# Workspace check +# ================================ +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# ================================ +# _run_go_enrichment function +# ================================ +def _run_go_enrichment(pivot_df: pd.DataFrame, results_dir: Path): + p_cutoff = 0.05 + fc_cutoff = 1.0 + + analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy() + + if analysis_df.empty: + st.error("No valid statistical data found for GO enrichment.") + st.write("❗ analysis_df is empty") + else: + with st.spinner("Fetching GO terms from MyGene.info API..."): + mg = mygene.MyGeneInfo() + + def get_clean_uniprot(name): + parts = str(name).split("|") + return parts[1] if len(parts) >= 2 else parts[0] + + analysis_df["UniProt"] = analysis_df["protein"].apply(get_clean_uniprot) + + bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist() + fg_ids = analysis_df[ + (analysis_df["p-value"] < p_cutoff) & + (analysis_df["log2FC"].abs() >= fc_cutoff) + ]["UniProt"].dropna().astype(str).unique().tolist() + # st.write("βœ… get_clean_uniprot applied") + + if len(fg_ids) < 3: + st.warning( + f"Not enough significant proteins " + f"(p < {p_cutoff}, |log2FC| β‰₯ {fc_cutoff}). " + f"Found: {len(fg_ids)}" + ) + st.write("❗ Not enough significant proteins") + else: + res_list = mg.querymany( + bg_ids, scopes="uniprot", fields="go", as_dataframe=False + ) + res_go = pd.DataFrame(res_list) + if "notfound" in res_go.columns: + res_go = res_go[res_go["notfound"] != True] + + def extract_go_terms(go_data, go_type): + if not isinstance(go_data, dict) or go_type not in go_data: + return [] + terms = go_data[go_type] + if isinstance(terms, dict): + terms = [terms] + return list({t.get("term") for t in terms if "term" in t}) + + for go_type in ["BP", "CC", "MF"]: + res_go[f"{go_type}_terms"] = res_go["go"].apply( + lambda x: extract_go_terms(x, go_type) + ) + + annotated_ids = set(res_go["query"].astype(str)) + fg_set = annotated_ids.intersection(fg_ids) + bg_set = annotated_ids + # st.write(f"βœ… fg_set bg_set are set") + + def run_go(go_type): + go2fg = defaultdict(set) + go2bg = defaultdict(set) + + for _, row in res_go.iterrows(): + uid = str(row["query"]) + for term in row[f"{go_type}_terms"]: + go2bg[term].add(uid) + if uid in fg_set: + go2fg[term].add(uid) + + records = [] + N_fg = len(fg_set) + N_bg = len(bg_set) + + for term, fg_genes in go2fg.items(): + a = len(fg_genes) + if a == 0: + continue + b = N_fg - a + c = len(go2bg[term]) - a + d = N_bg - (a + b + c) + + _, p = fisher_exact([[a, b], [c, d]], alternative="greater") + records.append({ + "GO_Term": term, + "Count": a, + "GeneRatio": f"{a}/{N_fg}", + "p_value": p, + }) + + df = pd.DataFrame(records) + if df.empty: + return None, None + + df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) + df = df.sort_values("p_value").head(20) + + # βœ… Plotly Figure + fig = px.bar( + df, + x="-log10(p)", + y="GO_Term", + orientation="h", + title=f"GO Enrichment ({go_type})", + ) + + # st.write(f"βœ… Plotly Figure generated") + + fig.update_layout( + yaxis=dict(autorange="reversed"), + height=500, + margin=dict(l=10, r=10, t=40, b=10), + ) + + return fig, df + + go_results = {} + + for go_type in ["BP", "CC", "MF"]: + fig, df_go = run_go(go_type) + if fig is not None: + go_results[go_type] = { + "fig": fig, + "df": df_go + } + # st.write(f"βœ… go_type generated") + + go_dir = results_dir / "go-terms" + go_dir.mkdir(parents=True, exist_ok=True) + + import json + go_data = {} + + for go_type in ["BP", "CC", "MF"]: + if go_type in go_results: + fig = go_results[go_type]["fig"] + df = go_results[go_type]["df"] + + go_data[go_type] = { + "fig_json": fig.to_json(), # Figure β†’ JSON string + "df_dict": df.to_dict(orient="records") # DataFrame β†’ list of dicts + } + + go_json_file = go_dir / "go_results.json" + with open(go_json_file, "w") as f: + json.dump(go_data, f) + st.session_state["go_results"] = go_results + st.session_state["go_ready"] = True if go_data else False + # st.write("βœ… GO enrichment analysis complete") + +results_dir = Path(st.session_state["workspace"]) / "topp-workflow" / "results" / "quant" +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info("Abundance data not available. Please run the workflow and configure sample groups first.") + st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") + st.stop() + +pivot_df, expr_df, group_map = result +_run_go_enrichment(pivot_df, results_dir) + +# ================================ +# Tabs +# ================================ +protein_tab, = st.tabs(["🧬 Protein Table"]) + +# ================================ +# Protein-level results +# ================================ +with protein_tab: + st.markdown("### 🧬 Protein-Level Abundance Table") + st.info( + "This protein-level table is generated by grouping all PSMs that map to the " + "same protein and aggregating their intensities across samples.\n\n" + "Additionally, log2 fold change and p-values are calculated between sample groups." + ) + + if pivot_df.empty: + st.info("No protein-level data available.") + else: + st.session_state["pivot_df"] = pivot_df + st.dataframe(pivot_df.sort_values("p-value"), width="stretch") + +# ====================================================== +# GO Enrichment Results +# ====================================================== +st.markdown("---") +st.subheader("🧬 GO Enrichment Analysis") + +go_json_file = results_dir / "go-terms" / "go_results.json" + +if not go_json_file.exists(): + st.info("GO Enrichment results are not available yet. Please run the analysis first.") +else: + import json + import plotly.io as pio + + with open(go_json_file, "r") as f: + go_data = json.load(f) + + bp_tab, cc_tab, mf_tab = st.tabs([ + "🧬 Biological Process", + "🏠 Cellular Component", + "βš™οΈ Molecular Function", + ]) + + for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]): + with tab: + if go_type not in go_data: + st.info(f"No enriched {go_type} terms found.") + continue + + fig_json = go_data[go_type]["fig_json"] + df_dict = go_data[go_type]["df_dict"] + + fig = pio.from_json(fig_json) + + df_go = pd.DataFrame(df_dict) + + if df_go.empty: + st.info(f"No enriched {go_type} terms found.") + else: + st.plotly_chart(fig, width="stretch") + + st.markdown(f"#### {go_type} Enrichment Results") + st.dataframe(df_go, width="stretch") \ No newline at end of file diff --git a/content/results_pca.py b/content/results_pca.py new file mode 100644 index 00000000..d15475da --- /dev/null +++ b/content/results_pca.py @@ -0,0 +1,103 @@ +"""PCA Results Page.""" +import streamlit as st +import pandas as pd +import plotly.express as px +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + +params = page_setup() +st.title("PCA Analysis") + +st.markdown( + """ +Principal Component Analysis (PCA) of protein-level abundance. +Samples are colored by group assignment to visualize clustering. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info("Abundance data not available. Please run the workflow and configure sample groups first.") + st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") + st.stop() + +pivot_df, expr_df, group_map = result + +top_n = 500 + +top_proteins = ( + pivot_df + .dropna(subset=["p-adj"]) + .sort_values("p-adj", ascending=True) + .head(top_n)["protein"] +) + +expr_df_pca = expr_df.loc[ + expr_df.index.intersection(top_proteins) +] + +if expr_df_pca.shape[0] < 2: + st.info("Not enough proteins after p-value filtering for PCA.") + st.stop() + +X = expr_df_pca.T +X_scaled = StandardScaler().fit_transform(X) + +pca = PCA(n_components=2) +pcs = pca.fit_transform(X_scaled) + +pca_df = pd.DataFrame( + pcs, + columns=["PC1", "PC2"], + index=X.index +) + +actual_sample_names = pca_df.index.tolist() + +norm_map = {} + +for k, v in group_map.items(): + try: + sample_idx = int(k) + 1 + target_substring = f"sample{sample_idx}[" + real_full_name = next((name for name in actual_sample_names if target_substring in name), None) + + if real_full_name: + norm_map[real_full_name] = v if v and v.strip() else "Unassigned" + except ValueError: + continue + +pca_df["Group"] = pca_df.index.map(norm_map) + +fig_pca = px.scatter( + pca_df, + x="PC1", + y="PC2", + color="Group", + text=pca_df.index, +) + +fig_pca.update_traces(textposition="top center") +fig_pca.update_layout( + xaxis_title=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)", + yaxis_title=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)", + height=600, +) + +st.plotly_chart(fig_pca, width="stretch") + +st.markdown(f"**Proteins used:** {expr_df_pca.shape[0]} (top {top_n} by p-adj)") + +st.markdown("---") +st.markdown("**Other visualizations:**") +col1, col2 = st.columns(2) +with col1: + st.page_link("content/results_volcano.py", label="Volcano Plot", icon="πŸŒ‹") +with col2: + st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") diff --git a/content/results_rescoring.py b/content/results_rescoring.py new file mode 100644 index 00000000..ed8ec5f0 --- /dev/null +++ b/content/results_rescoring.py @@ -0,0 +1,80 @@ +"""Rescoring (Percolator) Results Page.""" +import streamlit as st +from pathlib import Path +from src.common.common import page_setup +from src.common.results_helpers import get_workflow_dir +from openms_insight import Table, Heatmap, LinePlot, SequenceView, StateManager + +params = page_setup() +st.title("Rescoring Results") + +st.markdown( + """ +View PSMs after **Percolator** statistical validation. Percolator uses machine learning +to re-score PSMs and estimate false discovery rates (FDR) for more accurate results. +Click on a PSM to view the annotated spectrum and peptide sequence. +""" +) + +st.info( + "**Score:** The Posterior Error Probability (PEP) represents the probability that this PSM " + "is incorrect. Lower values indicate higher confidence identifications." +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +workflow_dir = get_workflow_dir(st.session_state["workspace"]) +perc_dir = workflow_dir / "results" / "percolator" +cache_dir = workflow_dir / "results" / "insight_cache" + +if not perc_dir.exists(): + st.info("No rescoring results available yet. Please run the workflow first.") + # st.page_link("content/workflow_run.py", label="Go to Run", icon="πŸš€") + st.stop() + +perc_files = sorted(perc_dir.glob("*.idXML")) + +if not perc_files: + st.warning("No rescoring output files found.") + st.stop() + +selected_file = st.selectbox( + "Select rescoring result file", + perc_files, + format_func=lambda x: x.name +) + +cache_id_prefix = selected_file.stem + +# Check if cache exists +if not (cache_dir / f"table_{cache_id_prefix}").is_dir(): + st.warning("Visualization cache not found. Please re-run the workflow.") + st.stop() + +# Initialize state manager for cross-component linking +state_manager = StateManager() + +# Load components from cache (no data parameter needed) +table = Table(cache_id=f"table_{cache_id_prefix}", cache_path=str(cache_dir)) +heatmap = Heatmap(cache_id=f"heatmap_{cache_id_prefix}", cache_path=str(cache_dir)) +seq_view = SequenceView(cache_id=f"seqview_{cache_id_prefix}", cache_path=str(cache_dir)) +line_plot = LinePlot(cache_id=f"lineplot_{cache_id_prefix}", cache_path=str(cache_dir)) + +# Render components +st.subheader("PSM Overview") +heatmap(state_manager=state_manager, height=350) + +st.subheader("PSM Table") +table(state_manager=state_manager, height=533) + +st.subheader("Peptide Sequence") +seq_view(key=f"seqview_{cache_id_prefix}", state_manager=state_manager, height=533) + +st.subheader("MS2 Spectrum") +line_plot(key=f"lineplot_{cache_id_prefix}", state_manager=state_manager, height=450, sequence_view_key=f"seqview_{cache_id_prefix}") + +st.markdown("---") +st.markdown("**Next step:** View filtered PSMs") +st.page_link("content/results_filtered.py", label="Go to Filtered PSMs", icon="🎯") diff --git a/content/results_volcano.py b/content/results_volcano.py new file mode 100644 index 00000000..17518075 --- /dev/null +++ b/content/results_volcano.py @@ -0,0 +1,109 @@ +"""Volcano Plot Results Page.""" +import streamlit as st +import plotly.express as px +import numpy as np +import pandas as pd +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data +from statsmodels.stats.multitest import multipletests +from scipy.stats import ttest_ind +from pathlib import Path + +params = page_setup() +st.title("Volcano Plot") + +st.markdown( + """ +Visualize differential expression analysis with a volcano plot. +Points represent proteins colored by significance status. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info("Abundance data not available. Please run the workflow and configure sample groups first.") + st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") + st.stop() + +# Load the processed dataframe from session state +pivot_df, expr_df, group_map = result + +# Threshold Selection UI +st.divider() +c1, c2 = st.columns(2) +with c1: + fc_thresh = st.slider( + "log2 Fold Change threshold", + min_value=0.1, + max_value=3.0, + value=1.0, + step=0.1, + ) +with c2: + p_thresh = st.slider( + "p-adj (FDR) threshold", + min_value=0.001, + max_value=0.1, + value=0.05, + step=0.001, + ) + +volcano_df = pivot_df.dropna(subset=["log2FC", "p-adj"]).copy() +volcano_df["neg_log10_padj"] = -np.log10(volcano_df["p-adj"]) + +volcano_df["Significance"] = "Not significant" +volcano_df.loc[ + (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh), + "Significance", +] = "Up-regulated" + +volcano_df.loc[ + (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh), + "Significance", +] = "Down-regulated" + +fig_volcano = px.scatter( + volcano_df, + x="log2FC", + y="neg_log10_padj", + color="Significance", + hover_data=["protein", "log2FC", "p-value", "p-adj"], + color_discrete_map={ + "Up-regulated": "red", + "Down-regulated": "blue", + "Not significant": "lightgrey", + } +) + +fig_volcano.add_vline(x=fc_thresh, line_dash="dash") +fig_volcano.add_vline(x=-fc_thresh, line_dash="dash") +fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash") + +# Make x-axis symmetric around zero +max_abs_fc = volcano_df["log2FC"].abs().max() +x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding + +fig_volcano.update_layout( + xaxis_title="log2 Fold Change", + yaxis_title="-log10(p-adj)", + xaxis_range=x_range, + height=600, +) + +st.plotly_chart(fig_volcano, width="stretch") + +up_count = (volcano_df["Significance"] == "Up-regulated").sum() +down_count = (volcano_df["Significance"] == "Down-regulated").sum() +st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}") + +st.markdown("---") +st.markdown("**Other visualizations:**") +col1, col2 = st.columns(2) +with col1: + st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") +with col2: + st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") diff --git a/src/Workflow.py b/src/Workflow.py index 7523a0cb..2764c8c1 100644 --- a/src/Workflow.py +++ b/src/Workflow.py @@ -1,11 +1,17 @@ +from altair import value import streamlit as st -from src.workflow.WorkflowManager import WorkflowManager +import pandas as pd +import numpy as np +import plotly.express as px +from src.workflow.WorkflowManager import WorkflowManager +from scipy.stats import ttest_ind +from pyopenms import IdXMLFile # for result section: from pathlib import Path -import pandas as pd -import plotly.express as px from src.common.common import show_fig +from openms_insight import Table, Heatmap, LinePlot, SequenceView +from src.common.results_helpers import parse_idxml, build_spectra_cache class Workflow(WorkflowManager): @@ -16,7 +22,7 @@ def __init__(self) -> None: super().__init__("TOPP Workflow", st.session_state["workspace"]) def upload(self) -> None: - t = st.tabs(["MS data"]) + t = st.tabs(["MS data", "FASTA database"]) with t[0]: # Use the upload method from StreamlitUI to handle mzML file uploads. self.ui.upload_widget( @@ -26,98 +32,1008 @@ def upload(self) -> None: fallback=[str(f) for f in Path("example-data", "mzML").glob("*.mzML")], ) + with t[1]: + self.ui.upload_widget( + key="fasta-file", + name="Protein FASTA database", + file_types=("fasta", "fa"), + fallback=[str(f) for f in Path("example-data", "db").glob("*.fasta")], + ) + @st.fragment def configure(self) -> None: # Allow users to select mzML files for the analysis. self.ui.select_input_file("mzML-files", multiple=True) + self.ui.select_input_file("fasta-file", multiple=False) # Create tabs for different analysis steps. t = st.tabs( - ["**Feature Detection**", "**Feature Linking**", "**Python Custom Tool**"] + ["**IsobaricAnalyzer**", "**CometAdapter**", "**PercolatorAdapter**", "**IDFilter**", "**IDMapper**", "**FileMerger**", + "**ProteinInference**", "**IDFilter**", "**IDConflictResolver**", "**ProteinQuantifier**", "**Group Selection**"] ) with t[0]: - # Parameters for FeatureFinderMetabo TOPP tool. + # Checkbox for decoy generation + # reactive=True ensures the parent configure() fragment re-runs when checkbox changes, + # so conditional UI (DecoyDatabase settings) updates immediately + self.ui.input_widget( + key="generate-decoys", + default=True, + name="Generate Decoy Database", + widget_type="checkbox", + help="Generate reversed decoy sequences for FDR calculation. Disable if your FASTA already contains decoys.", + reactive=True, + ) + + # Reload params to get current checkbox value after it was saved + self.params = self.parameter_manager.get_parameters_from_json() + + # Show DecoyDatabase settings if generating decoys + if self.params.get("generate-decoys", True): + st.info(""" + **Decoy Database Settings:** + * **method**: How decoy sequences are generated from target protein sequences. + *Reverse* creates decoys by reversing each sequence, while *shuffle* randomly + rearranges the amino acids. Both methods preserve the amino acid composition + of the original protein, ensuring decoys have similar properties to real sequences + for accurate false discovery rate (FDR) estimation. + """) + self.ui.input_TOPP( + "DecoyDatabase", + custom_defaults={ + "decoy_string": "rev_", + "decoy_string_position": "prefix", + "method": "reverse", + }, + include_parameters=["method"], + ) + + comet_info = """ + **Identification (Comet):** + * **enzyme**: The enzyme used for peptide digestion. + * **missed_cleavages**: Number of possible cleavage sites missed by the enzyme. It has no effect if enzyme is unspecific cleavage. + * **fixed_modifications**: Fixed modifications, specified using Unimod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)' + * **variable_modifications**: Variable modifications, specified using Unimod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)' + * **instrument**: Type of instrument (high_res or low_res). Use 'high_res' for high-resolution MS2 (Orbitrap, TOF), 'low_res' for ion trap. + * **fragment_mass_tolerance**: Fragment mass tolerance for MS2 matching. + * **fragment_bin_offset**: Offset for binning MS2 spectra. Typically 0.0 for high-res, 0.4 for low-res instruments. + """ + if not self.params.get("generate-decoys", True): + comet_info += """* **PeptideIndexing:decoy_string**: String that was appended (or prefixed - see 'decoy_string_position' flag below) to the accessions + in the protein database to indicate decoy proteins. + """ + st.info(comet_info) + + st.write(Path(self.workflow_dir, "results")) + + comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications", + "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset"] + if not self.params.get("generate-decoys", True): + # Only show decoy_string when not generating decoys + comet_include.append("PeptideIndexing:decoy_string") + self.ui.input_TOPP( - "FeatureFinderMetabo", - custom_defaults={"algorithm:common:noise_threshold_int": 1000.0}, + "IsobaricAnalyzer", + custom_defaults={ + "tmt11plex:reference_channel": 126, + "type": "tmt11plex", + "extraction:select_activation": "auto", + "extraction:reporter_mass_shift": 0.002, + "extraction:min_reporter_intensity": 0.0, + "extraction:min_precursor_purity": 0.0, + "extraction:precursor_isotope_deviation": 10.0, + "quantification:isotope_correction": "false", + } ) with t[1]: - # Parameters for MetaboliteAdductDecharger TOPP tool. - self.ui.input_TOPP("FeatureLinkerUnlabeledKD") + # Parameters for FeatureFinderMetabo TOPP tool. + # self.ui.input_TOPP( + # "FeatureFinderMetabo", + # custom_defaults={"algorithm:common:noise_threshold_int": 1000.0}, + # ) + comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications", + "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset", "PeptideIndexing:IL_equivalent"] + self.ui.input_TOPP( + "CometAdapter", + custom_defaults={ + "PeptideIndexing:IL_equivalent": True, + "clip_nterm_methionine": "true", + "instrument": "high_res", + "missed_cleavages": 2, + "min_peptide_length": 6, + "max_peptide_length": 40, + "enzyme": "Trypsin/P", + "PeptideIndexing:unmatched_action": "warn", + "max_variable_mods_in_peptide": 3, + "precursor_mass_tolerance": 4.5, + "isotope_error": "0/1", + "precursor_error_units": "ppm", + "num_hits": 1, + "num_enzyme_termini": "fully", + "fragment_bin_offset": 0.0, + "minimum_peaks": 10, + "precursor_charge": "2:4", + "fragment_mass_tolerance": 0.015, + "PeptideIndexing:unmatched_action": "warn", + "variable_modifications": "Oxidation (M)\nAcetyl (Protein N-term)\nTMT6plex (K)\nTMT6plex (N-term)", + "debug": 0, + "force": True, + }, + include_parameters=comet_include, + flag_parameters=["PeptideIndexing:IL_equivalent", "force"], + exclude_parameters=["second_enzyme"], + ) with t[2]: + st.info(""" + **Filtering (IDFilter):** + * **score:type_peptide**: Score used for filtering. If empty, the main score is used. + * **score:psm**: The score which should be reached by a peptide hit to be kept. (use 'NAN' to disable this filter) + """) + self.ui.input_TOPP( + "PercolatorAdapter", + custom_defaults={ + "subset_max_train": 300000, + "decoy_pattern": "DECOY_", + "score_type": "pep", + "post_processing_tdc": True, + "debug": 0, + }, + flag_parameters=["post_processing_tdc"], + ) + + with t[3]: + # Parameters for MetaboliteAdductDecharger TOPP tool. + # self.ui.input_TOPP("FeatureLinkerUnlabeledKD") + self.ui.input_TOPP( + "IDFilter", + custom_defaults={ + "score:type_peptide": "q-value", + "score:psm": 0.10, + }, + tool_instance_name="IDFilter-strict", + ) + with t[4]: + st.info(""" + **Quantification (ProteomicsLFQ):** + * **intThreshold**: Peak intensity threshold applied in seed detection. + * **psmFDR**: FDR threshold for sub-protein level (e.g. 0.05=5%). Use -FDR_type to choose the level. Cutoff is applied at the highest level. If Bayesian inference was chosen, it is equivalent with a peptide FDR + * **proteinFDR**: Protein FDR threshold (0.05=5%). + """) + self.ui.input_TOPP( + "IDMapper", + custom_defaults={ + "threads": 8, + "debug": 0, + } + ) + with t[5]: + self.ui.input_TOPP( + "FileMerger", + custom_defaults={ + "in_type": "consensusXML", + "append_method": "append_cols", + "annotate_file_origin": True, + "threads": 8, + }, + flag_parameters=["annotate_file_origin"] + ) + with t[6]: + self.ui.input_TOPP( + "ProteinInference", + custom_defaults={ + "threads": 8, + "picked_decoy_string": "DECOY_", + "picked_fdr": "true", + "protein_fdr": "true", + "Algorithm:use_shared_peptides": "true", + "Algorithm:annotate_indistinguishable_groups": "true", + "Algorithm:score_type": "PEP", + "Algorithm:score_aggregation_method": "best", + "Algorithm:min_peptides_per_protein": 1, + } + ) + with t[7]: # A single checkbox widget for workflow logic. - self.ui.input_widget("run-python-script", False, "Run custom Python script") + # self.ui.input_widget("run-python-script", False, "Run custom Python script") * # Generate input widgets for a custom Python tool, located at src/python-tools. # Parameters are specified within the file in the DEFAULTS dictionary. - self.ui.input_python("example") + # self.ui.input_python("example") * + self.ui.input_TOPP( + "IDFilter", + custom_defaults={ + "score:type_protein": "q-value", + "score:proteingroup": 0.01, + "score:psm": 0.01, + "delete_unreferenced_peptide_hits": True, + "remove_decoys": True + }, + flag_parameters=["delete_unreferenced_peptide_hits", "remove_decoys"], + tool_instance_name="IDFilter-lenient", + ) + with t[8]: + self.ui.input_TOPP( + "IDConflictResolver", + custom_defaults={ + "threads": 4, + } + ) + + with t[9]: + self.ui.input_TOPP( + "ProteinQuantifier", + custom_defaults={ + "method": "top", + "top:N": 3, + "top:aggregate": "median", + "top:include_all": True, + "ratios": True, + "threads": 8, + "debug": 0, + }, + flag_parameters=["top:include_all", "ratios"] + ) + with t[10]: + st.markdown("### πŸ§ͺ TMT Sample Group Assignment") + + # 1. Determine TMT type (e.g., tmt10plex, tmt16plex) + target_key = f"{self.parameter_manager.topp_param_prefix}IsobaricAnalyzer:1:type" + selected_tmt = st.session_state.get(target_key, "tmt12plex") + + if "tmt" in selected_tmt: + import re + # Extract the number to determine the plex count + num_plex_match = re.search(r'\d+', selected_tmt) + if num_plex_match: + num_plex = int(num_plex_match.group()) + all_channels = [f"sample{i+1}" for i in range(num_plex)] + + st.info( + "Enter a group name for each TMT channel.\n\n" + "Type **'skip'** for channels you wish to skip. (e.g., control, case, skip)" + ) + + # 2. Create an input_widget for each channel (automatically saved to params.json) + cols = st.columns(2) + for i, ch in enumerate(all_channels): + with cols[i % 2]: + self.ui.input_widget( + key=f"TMT-group-{ch}", + default="", + name=f"Group for {ch}", + widget_type="text", + help="Enter group name or 'skip' to ignore this channel.", + ) + + # 3. Read values from params.json and construct a dictionary in tmt_group_map format + # (This can be used later to filter DataFrames in subsequent logic) + self.params = self.parameter_manager.get_parameters_from_json() + + tmt_group_map = {} + for i, ch in enumerate(all_channels): + # Retrieve stored value (default is empty string) + group_val = self.params.get(f"TMT-group-{ch}", "") + tmt_group_map[str(i)] = group_val + + # For data inspection (remove if not needed) + if st.checkbox("Show current TMT mapping"): + st.json(tmt_group_map) + + # 4. Clean up parameters from unused/previous TMT settings + all_possible_channel_keys = {f"TMT-group-{ch}" for ch in all_channels} + orphaned_keys = [ + k for k in self.params.keys() + if k.startswith("TMT-group-") and k not in all_possible_channel_keys + ] + + if orphaned_keys: + for key in orphaned_keys: + del self.params[key] + self.parameter_manager.save_parameters() + + else: + st.warning("Please select a TMT type in the parameters first.") + # with t[10]: + # st.markdown("### πŸ§ͺ Sample Group Assignment") + # target_key = f"{self.parameter_manager.topp_param_prefix}IsobaricAnalyzer:1:type" + # selected_tmt = st.session_state.get(target_key, "tmt12plex") + + # if "tmt" in selected_tmt: + # import re + # num_plex = int(re.search(r'\d+', selected_tmt).group()) + # all_channels = [f"sample{i+1}" for i in range(num_plex)] + + # @st.fragment + # def render_group_assignment(): + # exclude_channels = st.multiselect( + # "Select samples to exclude.", + # options=all_channels, + # key="exclude_selector", + # help="Samples selected here will be excluded from the dataframe." + # ) + # st.write("exclude_channels:", exclude_channels) + + # exclude_indices = [i for i, ch in enumerate(all_channels) if ch in exclude_channels] + + # st.write("exclude_indices:", exclude_indices) + # st.session_state["tmt_exclude_indices"] = exclude_indices + + # keep_channels = [ch for ch in all_channels if ch not in exclude_channels] + + # st.info("Enter group names for remaining samples.") + # group_mapping = {} + + # if keep_channels: + # cols = st.columns(2) + # for idx, ch in enumerate(keep_channels): + # original_idx = all_channels.index(ch) + # with cols[idx % 2]: + # group_name = st.text_input( + # f"Group for {ch}", + # value=st.session_state.get("tmt_group_map", {}).get(original_idx, ""), + # placeholder="e.g. Control or Case", + # key=f"input_{ch}_{len(keep_channels)}" + # ) + # group_mapping[original_idx] = group_name + # else: + # st.warning("All samples were selected for exclusion.") + + # st.session_state["tmt_group_map"] = group_mapping + + # if st.checkbox("Saved session data check"): + # st.write("Indices to exclude:", st.session_state["tmt_exclude_indices"]) + # st.write("Groups for remaining indices:", st.session_state["tmt_group_map"]) + + # render_group_assignment() + + # else: + # st.warning("Please select a TMT type in the parameters first.") + def execution(self) -> None: # Any parameter checks, here simply checking if mzML files are selected if not self.params["mzML-files"]: self.logger.log("ERROR: No mzML files selected.") return + + if not self.params.get("fasta-file"): + st.error("No FASTA file selected.") + return False # Get mzML files with FileManager in_mzML = self.file_manager.get_files(self.params["mzML-files"]) + fasta_file = self.file_manager.get_files([self.params["fasta-file"]])[0] + + if len(in_mzML) < 1: + st.error("At least one mzML file is required.") + return False + + fasta_path = Path(fasta_file) + + self.logger.log(f"πŸ“‚ Loaded {len(in_mzML)} sample(s)") + + if self.params.get("generate-decoys", True): + decoy_fasta = fasta_path.with_suffix(".decoy.fasta") + # Get decoy_string from DecoyDatabase params + decoy_string = self.params.get("DecoyDatabase", {}).get("decoy_string", "rev_") + + if not decoy_fasta.exists(): + self.logger.log("🧬 Generating decoy database...") + st.info("Generating decoy FASTA database...") + if not self.executor.run_topp( + "DecoyDatabase", + {"in": [str(fasta_path)], "out": [str(decoy_fasta)]}, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… Decoy database ready") + st.success(f"Using decoy FASTA: {decoy_fasta.name}") + database_fasta = decoy_fasta + else: + # Get decoy_string from CometAdapter params + decoy_string = self.params.get("CometAdapter", {}).get("PeptideIndexing:decoy_string", "rev_") + self.logger.log("πŸ“„ Using existing FASTA database") + st.info(f"Using original FASTA: {fasta_path.name}") + database_fasta = fasta_path # Log any messages. self.logger.log(f"Number of input mzML files: {len(in_mzML)}") - # Prepare output files for feature detection. - out_ffm = self.file_manager.get_files( - in_mzML, "featureXML", "feature-detection" - ) + results_dir = Path(self.workflow_dir, "results") + iso_dir = results_dir / "isobaric_consensusXML" + comet_dir = results_dir / "comet_results" + perc_dir = results_dir / "percolator" + psm_filter_dir = results_dir / "psm_filter" + map_dir = results_dir / "idmapper" + merge_dir = results_dir / "merged" + protein_dir = results_dir / "protein" + msstats_dir = results_dir / "msstats" + quant_dir = results_dir / "quant_results" - # Run FeatureFinderMetabo tool with input and output files. - self.logger.log("Detecting features...") - self.executor.run_topp( - "FeatureFinderMetabo", input_output={"in": in_mzML, "out": out_ffm} - ) + iso_consensus = [] + comet_results = [] + percolator_results = [] + psm_filtered = [] + mapped_ids = [] - # Prepare input and output files for feature linking - in_fl = self.file_manager.get_files(out_ffm, collect=True) - out_fl = self.file_manager.get_files( - "feature_matrix.consensusXML", set_results_dir="feature-linking" - ) + for d in [ + iso_dir, comet_dir, perc_dir, psm_filter_dir, + map_dir, merge_dir, protein_dir, msstats_dir, quant_dir + ]: + d.mkdir(parents=True, exist_ok=True) - # Run FeatureLinkerUnlabaeledKD with all feature maps passed at once - self.logger.log("Linking features...") - self.executor.run_topp( - "FeatureLinkerUnlabeledKD", input_output={"in": in_fl, "out": out_fl} - ) - self.logger.log("Exporting consensus features to pandas DataFrame...") - self.executor.run_python( - "export_consensus_feature_df", input_output={"in": out_fl[0]} - ) - # Check if adduct detection should be run. - if self.params["run-python-script"]: - # Example for a custom Python tool, which is located in src/python-tools. - self.executor.run_python("example", {"in": in_mzML}) + for mz in in_mzML: + stem = Path(mz).stem + iso_consensus.append(str(iso_dir / f"{stem}_iso.consensusXML")) + comet_results.append(str(comet_dir / f"{stem}_comet.idXML")) + percolator_results.append(str(perc_dir / f"{stem}_comet_perc.idXML")) + psm_filtered.append(str(psm_filter_dir / f"{stem}_comet_perc_filter.idXML")) + mapped_ids.append(str(map_dir / f"{stem}_comet_perc_filter_map.consensusXML")) + + merged_id = str(merge_dir / "ID_mapper_merge.consensusXML") + protein_id = str(protein_dir / "ID_mapper_merge_epi.consensusXML") + protein_filter = str(protein_dir / "ID_mapper_merge_epi_filter.consensusXML") + protein_resolved = str(protein_dir / "ID_mapper_merge_epi_filter_resconf.consensusXML") + # msstats_input = str(msstats_dir / "msstats_input.csv") + consensus_out = str(quant_dir / "openms_design_protein_openms.csv") + + # --- IsobaricAnalyzer --- + self.logger.log("🏷️ Running isobaric labeling analysis...") + with st.spinner("IsobaricAnalyzer"): + if not self.executor.run_topp( + "IsobaricAnalyzer", + { + "in": in_mzML, + "out": iso_consensus, + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… IsobaricAnalyzer complete") + + # --- CometAdapter --- + self.logger.log("πŸ”Ž Running peptide search...") + with st.spinner(f"CometAdapter ({stem})"): + comet_extra_params = {"database": str(database_fasta)} + if self.params.get("generate-decoys", True): + # Propagate decoy_string from DecoyDatabase + comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string + if not self.executor.run_topp( + "CometAdapter", + { + "in": in_mzML, + "out": comet_results, + }, + comet_extra_params, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… CometAdapter complete") + + # Get fragment tolerance from CometAdapter parameters for visualization + comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") + frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) + frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" + + # Build visualization cache for Comet results + results_dir_path = Path(self.workflow_dir, "results") + cache_dir = results_dir_path / "insight_cache" + cache_dir.mkdir(parents=True, exist_ok=True) + + # Get mzML directory + mzml_dir = Path(in_mzML[0]).parent + + # Build spectra cache (once, shared by all stages) + spectra_df = None + filename_to_index = {} + + for idxml_file in comet_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Build spectra cache (only once) + if spectra_df is None: + filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} + spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + self.logger.log("βœ… Peptide search complete") + + # --- PercolatorAdapter --- + self.logger.log("πŸ“Š Running rescoring...") + with st.spinner(f"PercolatorAdapter"): + if not self.executor.run_topp( + "PercolatorAdapter", + { + "in": comet_results, + "out": percolator_results, + }, + ): + self.logger.log("Workflow stopped due to error") + return False + # Build visualization cache for Percolator results + for idxml_file in percolator_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + self.logger.log("βœ… PercolatorAdapter complete") + + # --- IDFilter --- + self.logger.log("πŸ”§ Filtering identifications...") + with st.spinner(f"IDFilter"): + if not self.executor.run_topp( + "IDFilter", + { + "in": percolator_results, + "out": psm_filtered, + }, + tool_instance_name="IDFilter-strict" + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… IDFilter-strict complete") + + # Build visualization cache for Filter results + for idxml_file in psm_filtered: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + # --- IDMapper --- + self.logger.log("πŸ—ΊοΈ Mapping IDs to isobaric consensus features...") + for iso, psm, mapped in zip(iso_consensus, psm_filtered, mapped_ids): + iso_stem = Path(iso).stem + with st.spinner(f"IDMapper ({iso_stem})"): + if not self.executor.run_topp( + "IDMapper", + { + "in": [iso], + "id": [psm], + "out": [mapped], + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… IDMapper complete") + + # --- FileMerger --- + self.logger.log("πŸ”— Merging mapped consensus files...") + with st.spinner("FileMerger"): + if not self.executor.run_topp( + "FileMerger", + { + "in": mapped_ids, + "out": [merged_id], + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… FileMerger complete") + + # --- ProteinInference --- + self.logger.log("🧩 Running protein inference...") + with st.spinner("ProteinInference"): + if not self.executor.run_topp( + "ProteinInference", + { + "in": [merged_id], + "out": [protein_id], + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… ProteinInference complete") + + # --- IDFilter-lenient (Protein) --- + self.logger.log("πŸ”¬ Filtering proteins...") + with st.spinner("IDFilter (Protein)"): + if not self.executor.run_topp( + "IDFilter", + { + "in": [protein_id], + "out": [protein_filter], + }, + tool_instance_name="IDFilter-lenient" + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… IDFilter-lenient (Protein) complete") + + # ================================ + # ✨ NEW: 8️⃣ IDConflictResolver (protein_filter β†’ protein_resolved) + # ================================ + self.logger.log("βš–οΈ Resolving ID conflicts...") + with st.spinner("IDConflictResolver"): + if not self.executor.run_topp( + "IDConflictResolver", + { + "in": [protein_filter], + "out": [protein_resolved], + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… IDConflictResolver complete") + + # ================================ + # ✨ NEW: πŸ”Ÿ ProteinQuantifier (protein_resolved β†’ consensus_out) + # ================================ + self.logger.log("πŸ“ Running protein quantification...") + with st.spinner("ProteinQuantifier"): + if not self.executor.run_topp( + "ProteinQuantifier", + { + "in": [protein_resolved], + "out": [consensus_out], + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("βœ… ProteinQuantifier complete") + self.logger.log("πŸŽ‰ WORKFLOW FINISHED") @st.fragment def results(self) -> None: - @st.fragment - def show_consensus_features(): - df = pd.read_csv(file, sep="\t", index_col=0) - st.metric("number of consensus features", df.shape[0]) - c1, c2 = st.columns(2) - rows = c1.dataframe(df, selection_mode="multi-row", on_select="rerun")[ - "selection" - ]["rows"] - if rows: - df = df.iloc[rows, 4:] - fig = px.bar(df, barmode="group", labels={"value": "intensity"}) - with c2: - show_fig(fig, "consensus-feature-intensities") + st.title("πŸ“Š DDA-TMT Analysis Results") + + # Tab configuration (TMT-specific) + tabs = st.tabs([ + "πŸ” Identification", + "πŸ” Rescoring & Filter", + "πŸ“Š TMT Reporter Intensity", + "🧬 Protein Grouping", + "πŸŒ‹ Statistical Analysis" + ]) + + id_tab, filter_tab, tmt_tab, prot_tab, stat_tab = tabs + + # Helper: idXML to DataFrame (Maintain existing code) + def idxml_to_df(idxml_file): + proteins, peptides = [], [] + IdXMLFile().load(str(idxml_file), proteins, peptides) + records = [] + for pep in peptides: + rt, mz = pep.getRT(), pep.getMZ() + for h in pep.getHits(): + records.append({ + "RT": rt, "m/z": mz, "Sequence": h.getSequence().toString(), + "Charge": h.getCharge(), "Score": h.getScore(), + "Proteins": ",".join([ev.getProteinAccession() for ev in h.getPeptideEvidences()]) + }) + return pd.DataFrame(records) + + # 1. Identification (Comet) + with id_tab: + comet_files = sorted(Path(self.workflow_dir, "results", "comet").glob("*.idXML")) + if comet_files: + selected_file = st.selectbox("Select Identification file", comet_files, key="comet_sb") + df_comet = idxml_to_df(selected_file) + st.dataframe(df_comet, use_container_width=True) + # Scatter plot code can remain the same + + # 2. Filtering (Percolator + IDFilter) + with filter_tab: + filter_files = sorted(Path(self.workflow_dir, "results", "psm_filter").glob("*.idXML")) + if filter_files: + selected_f = st.selectbox("Select Filtered file", filter_files, key="filter_sb") + df_filter = idxml_to_df(selected_f) + st.success(f"Identified {len(df_filter)} PSMs after filtering (FDR < 0.01)") + st.dataframe(df_filter, use_container_width=True) + + # 3. TMT Reporter Intensity (IsobaricAnalyzer) + with tmt_tab: + st.subheader("TMT Reporter Ion Intensity Distribution") + # Extract channel-specific intensities from IsobaricAnalyzer consensusXML + iso_files = sorted(Path(self.workflow_dir, "results", "isobaric_consensusXML").glob("*.consensusXML")) + if iso_files: + sel_iso = st.selectbox("Select TMT result", iso_files) + # For simple visualization, it is better to use the quantitative results CSV if available + # Example distribution using the final CSV (openms_design_protein_openms.csv) + quant_file = Path(self.workflow_dir, "results", "quant", "openms_design_protein_openms.csv") + if quant_file.exists(): + df_q = pd.read_csv(quant_file) + # Filter channel columns (usually prefixed with 'Abundance_' or specific tags) + # Column names need to be verified based on the OpenMS output structure + intensity_cols = [c for c in df_q.columns if 'intensity' in c.lower() or 'abundance' in c.lower()] + if intensity_cols: + fig_box = px.box(df_q.melt(value_vars=intensity_cols), x="variable", y="value", log_y=True, + title="Log-scaled Reporter Intensity Distribution per Channel") + st.plotly_chart(fig_box) + + # 4. Protein Grouping & Quantification + with prot_tab: + st.subheader("🧬 Final Protein-Level Results") + final_csv = Path(self.workflow_dir, "results", "quant", "openms_design_protein_openms.csv") + + if final_csv.exists(): + df_final = pd.read_csv(final_csv) + st.info(f"Total Protein Groups: {df_final['ProteinName'].nunique() if 'ProteinName' in df_final.columns else len(df_final)}") + st.dataframe(df_final, use_container_width=True) + + # CSV Download Button + st.download_button("Download Results", df_final.to_csv(index=False), "TMT_Results.csv") else: - st.info( - "πŸ’‘ Select one ore more rows in the table to show a barplot with intensities." - ) + st.warning("Final quantification CSV not found.") - file = Path( - self.workflow_dir, "results", "feature-linking", "feature_matrix.tsv" - ) - if file.exists(): - show_consensus_features() - else: - st.warning("No consensus feature file found. Please run workflow first.") + # 5. Statistical Analysis (Volcano Plot etc.) + with stat_tab: + final_csv = Path(self.workflow_dir, "results", "quant", "openms_design_protein_openms.csv") + + if not final_csv.exists(): + st.warning("Analysis results not found. Please run the workflow first.") + return + + try: + # 1️⃣ Data loading and preprocessing + df_quant = pd.read_csv(final_csv) + + # Identify intensity (abundance) columns from TMT results + # Typically starts with 'abundance_' or 'intensity_' depending on OpenMS output format + intensity_cols = [c for c in df_quant.columns if 'abundance' in c.lower() or 'intensity' in c.lower()] + + if len(intensity_cols) < 2: + st.error("Not enough intensity columns found for comparison.") + return + + # 2️⃣ Group setup (use existing source's group_map) + # In TMT, multiple channels (samples) exist within a single file, + # so map which column belongs to which group (Control/Case). + st.subheader("Group Comparison Setup") + col1, col2 = st.columns(2) + with col1: + group_a_cols = st.multiselect("Select Control Group Channels", intensity_cols, default=[intensity_cols[0]]) + with col2: + group_b_cols = st.multiselect("Select Case Group Channels", intensity_cols, default=[intensity_cols[-1]]) + + if st.button("Run Statistical Analysis"): + stats_results = [] + + for _, row in df_quant.iterrows(): + g1 = row[group_a_cols].values.astype(float) + g2 = row[group_b_cols].values.astype(float) + + # Calculate Log2 Fold Change + log2fc = np.log2(np.mean(g2) / np.mean(g1)) if np.mean(g1) > 0 else 0 + + # T-test (p-value) + _, pval = ttest_ind(g1, g2, nan_policy='omit') + + stats_results.append({ + "Protein": row.get("ProteinName", "Unknown"), + "log2FC": log2fc, + "pvalue": pval, + "-log10_pvalue": -np.log10(pval) if pval > 0 else 0 + }) + + df_stats = pd.DataFrame(stats_results) + + # 3️⃣ Volcano plot visualization + st.divider() + st.subheader("Volcano Plot") + + # Define colors to highlight significant proteins + df_stats['Significance'] = 'Normal' + df_stats.loc[(df_stats['log2FC'] > 1) & (df_stats['pvalue'] < 0.05), 'Significance'] = 'Up' + df_stats.loc[(df_stats['log2FC'] < -1) & (df_stats['pvalue'] < 0.05), 'Significance'] = 'Down' + + fig_volcano = px.scatter( + df_stats, x="log2FC", y="-log10_pvalue", + color="Significance", + hover_data=["Protein"], + color_discrete_map={'Up': 'red', 'Down': 'blue', 'Normal': 'gray'}, + title=f"Comparison: {', '.join(group_b_cols)} vs {', '.join(group_a_cols)}" + ) + + # Guidelines (p=0.05, FC=2) + fig_volcano.add_hline(y=-np.log10(0.05), line_dash="dash", line_color="black") + fig_volcano.add_vline(x=1, line_dash="dash", line_color="black") + fig_volcano.add_vline(x=-1, line_dash="dash", line_color="black") + + st.plotly_chart(fig_volcano, use_container_width=True) + st.dataframe(df_stats.sort_values("pvalue"), use_container_width=True) + + except Exception as e: + st.error(f"Error during statistical analysis: {e}") \ No newline at end of file diff --git a/src/common/results_helpers.py b/src/common/results_helpers.py new file mode 100644 index 00000000..8ff3110d --- /dev/null +++ b/src/common/results_helpers.py @@ -0,0 +1,483 @@ +"""Helper functions for results pages.""" +import re +import pandas as pd +import polars as pl +import numpy as np +import streamlit as st +from pathlib import Path +from scipy.stats import ttest_ind +from pyopenms import IdXMLFile, MSExperiment, MzMLFile +from src.workflow.ParameterManager import ParameterManager +from statsmodels.stats.multitest import multipletests + +def get_workflow_dir(workspace): + """Get the workflow directory path.""" + return Path(workspace, "topp-workflow") + + +def idxml_to_df(idxml_file): + """Parse idXML file and return DataFrame with peptide hits.""" + proteins = [] + peptides = [] + IdXMLFile().load(str(idxml_file), proteins, peptides) + + records = [] + for pep in peptides: + rt = pep.getRT() + mz = pep.getMZ() + for h in pep.getHits(): + protein_refs = [ev.getProteinAccession() for ev in h.getPeptideEvidences()] + records.append({ + "RT": rt, + "m/z": mz, + "Sequence": h.getSequence().toString(), + "Charge": h.getCharge(), + "Score": h.getScore(), + "Proteins": ",".join(protein_refs) if protein_refs else None, + }) + + df = pd.DataFrame(records) + if not df.empty: + df["Charge"] = df["Charge"].astype(str) + df["Charge_num"] = df["Charge"].astype(int) + return df + + +def create_psm_scatter_plot(df_plot): + """Create a scatter plot for PSM visualization.""" + import plotly.express as px + + fig = px.scatter( + df_plot, + x="RT", + y="m/z", + color="Score", + custom_data=["index", "Sequence", "Proteins"], + color_continuous_scale=["#a6cee3", "#1f78b4", "#08519c", "#08306b"], + ) + fig.update_traces( + marker=dict(size=6, opacity=0.8), + hovertemplate='Index: %{customdata[0]}
' + + 'RT: %{x:.2f}
' + + 'm/z: %{y:.4f}
' + + 'Score: %{marker.color:.3f}
' + + 'Sequence: %{customdata[1]}
' + + 'Proteins: %{customdata[2]}
' + + '' + ) + fig.update_layout( + coloraxis_colorbar=dict(title="Score"), + hovermode="closest" + ) + return fig + + +def extract_scan_from_ref(spec_ref: str) -> int: + """Extract scan number from spectrum reference string. + + Format: "controllerType=0 controllerNumber=1 scan=1234" + """ + match = re.search(r'scan=(\d+)', spec_ref) + return int(match.group(1)) if match else 0 + + +def extract_scan_number(native_id: str) -> int: + """Extract scan number from native ID.""" + match = re.search(r'scan=(\d+)', native_id) + return int(match.group(1)) if match else 0 + + +def extract_filename_from_idxml(idxml_path: Path) -> str: + """Derive mzML filename from idXML filename.""" + stem = idxml_path.stem + for suffix in ['_comet', '_per', '_filter']: + stem = stem.replace(suffix, '') + return f"{stem}.mzML" + + +def parse_idxml(idxml_path: Path) -> tuple[pl.DataFrame, list[str]]: + """Parse idXML and return DataFrame for openms_insight. + + Returns: + Tuple of (id_df, spectra_data list of source filenames) + """ + proteins = [] + peptides = [] + IdXMLFile().load(str(idxml_path), proteins, peptides) + + # Derive mzML filename from idXML filename (e.g., 02COVID_filter.idXML -> 02COVID.mzML) + spectra_data = [extract_filename_from_idxml(idxml_path)] + + # Build filename to index mapping + filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} + + records = [] + for pep in peptides: + # Get spectrum reference from meta value (key may be bytes or string) + spec_ref = "" + if pep.metaValueExists("spectrum_reference"): + spec_ref = pep.getMetaValue("spectrum_reference") + if isinstance(spec_ref, bytes): + spec_ref = spec_ref.decode() + scan_id = extract_scan_from_ref(spec_ref) + + # Get file index from id_merge_index or derive from filename + file_index = pep.getMetaValue("id_merge_index") if pep.metaValueExists("id_merge_index") else 0 + filename = spectra_data[file_index] if file_index < len(spectra_data) else "" + + for h in pep.getHits(): + records.append({ + "id_idx": len(records), + "scan_id": scan_id, + "file_index": file_index, + "filename": Path(filename).name if filename else "", + "sequence": h.getSequence().toString(), + "charge": h.getCharge(), + "mz": pep.getMZ(), + "rt": pep.getRT(), + "score": h.getScore(), + "protein_accession": ";".join([ev.getProteinAccession() for ev in h.getPeptideEvidences()]), + }) + + return pl.DataFrame(records), spectra_data + + +def build_spectra_cache(mzml_dir: Path, filename_to_index: dict) -> tuple[pl.DataFrame, dict]: + """Extract MS2 spectra from mzML files and return DataFrame. + + Args: + mzml_dir: Directory containing mzML files + filename_to_index: Dict mapping filename to file_index + + Returns: + Tuple of (spectra_df, updated filename_to_index) + """ + records = [] + peak_id = 0 + + for mzml_path in sorted(mzml_dir.glob("*.mzML")): + # Get or create file index + if mzml_path.name not in filename_to_index: + filename_to_index[mzml_path.name] = len(filename_to_index) + file_index = filename_to_index[mzml_path.name] + + exp = MSExperiment() + MzMLFile().load(str(mzml_path), exp) + + for spec in exp: + if spec.getMSLevel() != 2: + continue + scan_id = extract_scan_number(spec.getNativeID()) + mz_array, int_array = spec.get_peaks() + + for mz, intensity in zip(mz_array, int_array): + records.append({ + "peak_id": peak_id, + "file_index": file_index, + "scan_id": scan_id, + "mass": float(mz), # Use "mass" not "mz" + "intensity": float(intensity), + }) + peak_id += 1 + + return pl.DataFrame(records), filename_to_index + + +@st.cache_data +def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None: + """Load CSV, compute stats (log2FC, p-value), build pivot_df and expr_df. + + Args: + workspace_path: Path to the workspace directory + csv_mtime: Modification time of CSV file (used as cache key) + + Returns: + Tuple of (pivot_df, expr_df, group_map) or None if data unavailable + """ + workflow_dir = get_workflow_dir(Path(workspace_path)) + quant_dir = workflow_dir / "results" / "quant_results" + + if not quant_dir.exists(): + return None + + csv_files = sorted(quant_dir.glob("*.csv")) + if not csv_files: + return None + + csv_file = csv_files[0] + + try: + df = pd.read_csv(csv_file, sep="\t", comment="#", engine="python") + except Exception: + return None + + st.write(f"Loaded quantification data from {csv_file.name} (mtime: {csv_mtime})") + + if df.empty: + return None + + # ratio column removal + df = df.loc[:, ~df.columns.str.contains('ratio', case=False)] + + # exclude_indices = st.session_state.get("tmt_exclude_indices", []) + # group_map = st.session_state.get("tmt_group_map", {}) + # Get group mapping from parameters + param_manager = ParameterManager(Path(workflow_dir)) + params = param_manager.get_parameters_from_json() + group_map = {} + for key, value in params.items(): + if key.startswith("TMT-group-") and value: + # Extract the numeric part from keys like "TMT-group-sample1" + match = re.search(r'sample(\d+)', key) + if match: + # Subtract 1 to convert to a 0-based index (0, 1, 2...). + # If your samples are already 0-based, remove the -1 adjustment. + index = str(int(match.group(1)) - 1) + group_map[index] = value + + # 1. Extract keys labeled as "skip" from group_map as integer list + exclude_indices = [ + int(k) for k, v in group_map.items() if v.lower() == "skip" + ] + + # 2. Remove "skip" entries from group_map (keep only actual group info) + group_map = { + int(k): v for k, v in group_map.items() if v.lower() != "skip" + } + + start_column_offset = 4 + + # st.write("exclude_indices:", exclude_indices) + # st.write("group_map:", group_map) + + if not group_map: + st.warning("⚠️ Group mapping information is missing. Please configure sample groups in the Setup page.") + return None + + if exclude_indices: + # st.write("Current columns:", df.columns.tolist()) + # st.write("Number of columns:", len(df.columns)) + # st.write("Exclude indices:", exclude_indices) + # st.write("Offset:", start_column_offset) + cols_to_drop = [df.columns[i + start_column_offset] for i in exclude_indices] + df_cleaned = df.drop(columns=cols_to_drop) + else: + df_cleaned = df.copy() + + if group_map: + # Create new row data (defaulting to empty strings) + # Create a list with the same length as the column order of df_cleaned + new_row = [""] * len(df_cleaned.columns) + new_row[0] = "Group" + + # Get the column names of the current dataframe as a list + current_cols = df_cleaned.columns.tolist() + original_cols = df.columns.tolist() + + for col_name in current_cols[start_column_offset:]: + # Check the original index position of this column + original_idx = original_cols.index(col_name) - start_column_offset + col_pos = current_cols.index(col_name) + new_row[col_pos] = group_map.get(original_idx, "NA") + + # Insert the row at the top of the dataframe + # Create a new DF and concatenate to prepend the row to existing data + group_df = pd.DataFrame([new_row], columns=df_cleaned.columns) + df_with_groups = pd.concat([group_df, df_cleaned], ignore_index=True) + + # drop_msg = f"{len(exclude_indices)} channels dropped" if exclude_indices else "No channels dropped" + # st.success(f"βœ… {drop_msg} and Group names have been inserted at the top of the data.") + + # st.write("### Data Preview with Group Information") + # st.dataframe(df_with_groups.head(10)) + + if group_map and len(set(group_map.values())) >= 2: + # Prepare data for calculation + # Extract group information from row 0 of df_with_groups (the newly added Group row) + # Actual sample data starts from the 5th column (index 4) + group_info_row = df_with_groups.iloc[0] + + # Get unique group names (excluding NA) + unique_groups = sorted([g for g in set(group_map.values()) if g != "NA"]) + g1_name, g2_name = unique_groups[0], unique_groups[1] + + # Extract numerical data for statistical calculation (from row 1 and column index 4 onwards) + # Convert to numeric type (to prevent calculation errors) + numeric_data = df_with_groups.iloc[1:, 4:].apply(pd.to_numeric, errors='coerce') + + # Column indexing by group + # Categorize columns based on the values in the Group row + g1_cols = [col for col in numeric_data.columns if group_info_row[col] == g1_name] + g2_cols = [col for col in numeric_data.columns if group_info_row[col] == g2_name] + + # Calculate log2FC and p-value for each row + def run_stats(row): + v1 = row[g1_cols].dropna() + v2 = row[g2_cols].dropna() + + # log2FC (Group2 / Group1) + m1, m2 = v1.mean(), v2.mean() + l2fc = np.log2(m2 / m1) if m1 > 0 and m2 > 0 else np.nan + + # p-value (T-test) + if len(v1) > 1 and len(v2) > 1: + _, pval = ttest_ind(v1, v2, equal_var=False) + else: + pval = np.nan + return pd.Series([l2fc, pval]) + + stats_results = numeric_data.apply(run_stats, axis=1) + stats_results.columns = ['log2FC', 'p-value'] + # Add Adjusted p-value (FDR) calculation + if not stats_results['p-value'].isna().all(): + # Select only rows that contain p-values + mask = stats_results['p-value'].notna() + # Apply Benjamini-Hochberg (BH) correction + _, p_adj, _, _ = multipletests(stats_results.loc[mask, 'p-value'], method='fdr_bh') + stats_results.loc[mask, 'p-adj'] = p_adj + else: + stats_results['p-adj'] = np.nan + + # Construct the final dataframe (Based on df_cleaned - excluding the group row) + # Insert calculation results into the 2nd and 3rd column positions + pivot_df = df_cleaned.copy() + pivot_df.insert(1, "log2FC", stats_results['log2FC'].values) + pivot_df.insert(2, "p-value", stats_results['p-value'].values) + pivot_df.insert(3, "p-adj", stats_results['p-adj'].values) + + # st.success(f"Analysis Complete: {g1_name} (n={len(g1_cols)}) vs {g2_name} (n={len(g2_cols)})") + + # Set the first column ('protein') of final_df as the index + protein_col = pivot_df.columns[0] + sample_cols = current_cols[start_column_offset:] # Identify actual sample column names + + # Select sample columns and create a matrix + expr_df = pivot_df.set_index(protein_col)[sample_cols] + + # Replace 0 with NaN (to prevent log transformation errors) + expr_df = expr_df.replace(0, np.nan) + + # Log2 transformation (data normalization) + expr_df = np.log2(expr_df + 1) + + # Remove proteins (rows) with any missing values + expr_df = expr_df.dropna() + + return pivot_df, expr_df, group_map + else: + st.warning("⚠️ At least two distinct groups are required for statistical analysis.") + else: + st.warning("⚠️ No group mapping information is set. Please check the Configure page.") + return None + + # Get group mapping from parameters + # param_manager = ParameterManager(workflow_dir) + # params = param_manager.get_parameters_from_json() + # group_map = { + # key[11:]: value # Remove "mzML-group-" prefix + # for key, value in params.items() + # if key.startswith("mzML-group-") and value + # } + + # if not group_map: + # return None + + # df["Sample"] = df["Reference"].str.replace(".mzML", "", regex=False) + # df["Group"] = df["Reference"].map(group_map) + # df = df.dropna(subset=["Group"]) + + # groups = sorted(df["Group"].unique()) + + # if len(groups) < 2: + # return None + + # group1, group2 = groups[:2] + + # # Compute statistics per protein + # stats_rows = [] + # for protein, protein_df in df.groupby("ProteinName"): + # g1_vals = protein_df[protein_df["Group"] == group1]["Intensity"].values + # g2_vals = protein_df[protein_df["Group"] == group2]["Intensity"].values + + # if len(g1_vals) < 2 or len(g2_vals) < 2: + # pval = np.nan + # else: + # _, pval = ttest_ind(g1_vals, g2_vals, equal_var=False) + + # mean_g1 = np.mean(g1_vals) if len(g1_vals) > 0 else np.nan + # mean_g2 = np.mean(g2_vals) if len(g2_vals) > 0 else np.nan + + # log2fc = np.log2(mean_g2 / mean_g1) if mean_g1 > 0 else np.nan + + # stats_rows.append({ + # "ProteinName": protein, + # "log2FC": log2fc, + # "p-value": pval, + # }) + + # stats_df = pd.DataFrame(stats_rows) + + # if not stats_df.empty: + # mask = stats_df["p-value"].notna() + # if mask.any(): + # _, p_adj, _, _ = multipletests(stats_df.loc[mask, "p-value"], method="fdr_bh") + # stats_df.loc[mask, "p-adj"] = p_adj + # else: + # stats_df["p-adj"] = np.nan + + # # Order samples by group (group2 first, then group1) + # sample_group_df = df[["Sample", "Group"]].drop_duplicates() + # group2_samples = sample_group_df[sample_group_df["Group"] == group2]["Sample"].tolist() + # group1_samples = sample_group_df[sample_group_df["Group"] == group1]["Sample"].tolist() + # all_samples = group2_samples + group1_samples + + # # Build pivot table + # pivot_list = [] + # for protein, group_df in df.groupby("ProteinName"): + # peptides = ";".join(group_df["PeptideSequence"].unique()) + # intensity_dict = group_df.groupby("Sample")["Intensity"].sum().to_dict() + # intensity_dict_complete = { + # sample: intensity_dict.get(sample, 0) + # for sample in all_samples + # } + # row = { + # "ProteinName": protein, + # **intensity_dict_complete, + # "PeptideSequence": peptides, + # } + # pivot_list.append(row) + + # pivot_df = pd.DataFrame(pivot_list) + # pivot_df = pivot_df.merge(stats_df, on="ProteinName", how="left") + # pivot_df = pivot_df[["ProteinName", "log2FC", "p-value", "p-adj"] + all_samples + ["PeptideSequence"]] + + # # Build expression matrix (log2-transformed) + # expr_df = pivot_df.set_index("ProteinName")[all_samples] + # expr_df = expr_df.replace(0, np.nan) + # expr_df = np.log2(expr_df + 1) + # expr_df = expr_df.dropna() + + # return pivot_df, expr_df, group_map + + +def get_abundance_data(workspace: Path) -> tuple | None: + """Wrapper that handles cache key (workspace + CSV mtime). + + Args: + workspace: Path to the workspace directory + + Returns: + Tuple of (pivot_df, expr_df, group_map) or None if data unavailable + """ + workflow_dir = get_workflow_dir(workspace) + quant_dir = workflow_dir / "results" / "quant_results" + + if not quant_dir.exists(): + return None + + csv_files = sorted(quant_dir.glob("*.csv")) + if not csv_files: + return None + + csv_mtime = csv_files[0].stat().st_mtime + return load_abundance_data(str(workspace), csv_mtime) diff --git a/src/workflow/CommandExecutor.py b/src/workflow/CommandExecutor.py index 14d28a32..4c1d4337 100644 --- a/src/workflow/CommandExecutor.py +++ b/src/workflow/CommandExecutor.py @@ -21,6 +21,7 @@ class CommandExecutor: for execution. """ # Methods for running commands and logging + def __init__(self, workflow_dir: Path, logger: Logger, parameter_manager: ParameterManager): self.pid_dir = Path(workflow_dir, "pids") self.logger = logger @@ -268,6 +269,11 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}, tool # Load merged parameters (_defaults + user overrides) for this tool instance merged_params = self.parameter_manager.get_merged_params(params_key) + flag_map = self.parameter_manager.get_parameters_from_json().get("_flag_params", {}) + if not flag_map: + flag_map = st.session_state.get("_topp_flag_params", {}) + flag_params = set(flag_map.get(params_key, [])) + # Construct commands for each process for i in range(n_processes): command = [tool] @@ -288,28 +294,55 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}, tool command += [value[i]] # Add merged TOPP tool parameters (_defaults + user overrides) for k, v in merged_params.items(): - command += [f"-{k}"] - # Skip only empty strings (pass flag with no value) - # Note: 0 and 0.0 are valid values, so use explicit check - if v != "" and v is not None: - if isinstance(v, str) and "\n" in v: - command += v.split("\n") + if k in flag_params: + # CLI flag: include "-k" only when enabled + if isinstance(v, str): + is_enabled = v.lower() in {"true", "1", "yes", "on"} else: - command += [str(v)] + is_enabled = bool(v) + if is_enabled: + command += [f"-{k}"] + continue + # For non-flag parameters, skip entirely if empty. + # Note: 0 and 0.0 are valid values, so use explicit checks. + if v == "" or v is None: + continue + command += [f"-{k}"] + if isinstance(v, str) and "\n" in v: + command += v.split("\n") + elif isinstance(v, bool): + command += [str(v).lower()] + else: + command += [str(v)] # Add custom parameters for k, v in custom_params.items(): - command += [f"-{k}"] - # Skip only empty strings (pass flag with no value) - # Note: 0 and 0.0 are valid values, so use explicit check - if v != "" and v is not None: - if isinstance(v, list): - command += [str(x) for x in v] + if k in flag_params: + if isinstance(v, str): + is_enabled = v.lower() in {"true", "1", "yes", "on"} else: - command += [str(v)] + is_enabled = bool(v) + if is_enabled: + command += [f"-{k}"] + continue + if v == "" or v is None: + continue + command += [f"-{k}"] + if isinstance(v, list): + command += [str(x) for x in v] + elif isinstance(v, bool): + command += [str(v).lower()] + else: + command += [str(v)] # Add threads parameter for TOPP tools command += ["-threads", str(threads_per_command)] commands.append(command) + for idx, cmd in enumerate(commands): + # Print list-form command joined into a single string for readability + print(f" πŸ”Ή Command {idx + 1}: {' '.join(cmd)}") + print("==========================================================\n") + + # Run command(s) if len(commands) == 1: return self.run_command(commands[0]) diff --git a/src/workflow/StreamlitUI.py b/src/workflow/StreamlitUI.py index c0880966..ff3dd05e 100644 --- a/src/workflow/StreamlitUI.py +++ b/src/workflow/StreamlitUI.py @@ -31,6 +31,7 @@ class StreamlitUI: """ # Methods for Streamlit UI components + def __init__(self, workflow_dir, logger, executor, parameter_manager): self.workflow_dir = workflow_dir self.logger = logger @@ -612,6 +613,7 @@ def input_TOPP( num_cols: int = 4, exclude_parameters: List[str] = [], include_parameters: List[str] = [], + flag_parameters: List[str] = [], display_tool_name: bool = True, display_subsections: bool = True, display_subsection_tabs: bool = False, @@ -628,6 +630,8 @@ def input_TOPP( num_cols (int, optional): Number of columns to use for the layout. Defaults to 3. exclude_parameters (List[str], optional): List of parameter names to exclude from the widget. Defaults to an empty list. include_parameters (List[str], optional): List of parameter names to include in the widget. Defaults to an empty list. + flag_parameters (List[str], optional): List of parameter names that should + be treated as no-value CLI flags during command construction. display_tool_name (bool, optional): Whether to display the TOPP tool name. Defaults to True. display_subsections (bool, optional): Whether to split parameters into subsections based on the prefix. Defaults to True. display_subsection_tabs (bool, optional): Whether to display main subsections in separate tabs (if more than one main section). Defaults to False. @@ -647,6 +651,16 @@ def input_TOPP( if "_topp_tool_instance_map" not in st.session_state: st.session_state["_topp_tool_instance_map"] = {} st.session_state["_topp_tool_instance_map"][tool_instance_name] = topp_tool_name + if "_topp_flag_params" not in st.session_state: + st.session_state["_topp_flag_params"] = {} + st.session_state["_topp_flag_params"][tool_instance_name] = list(flag_parameters) + # Persist flag metadata so execution still sees it outside UI reruns/session context. + params = self.parameter_manager.get_parameters_from_json() + if "_flag_params" not in params: + params["_flag_params"] = {} + params["_flag_params"][tool_instance_name] = list(flag_parameters) + with open(self.parameter_manager.params_file, "w", encoding="utf-8") as f: + json.dump(params, f, indent=4) if not display_subsections: display_subsection_tabs = False @@ -738,6 +752,7 @@ def _matches_parameter(pattern: str, key: bytes) -> bool: ":".join(key.decode().split(":")[:-1]) ), } + p["is_flag"] = (b"flag" in param.getTags(key)) # Parameter sections and subsections as string (e.g. "section:subsection") if display_subsections: p["sections"] = ":".join( @@ -817,27 +832,58 @@ def display_TOPP_params(params: dict, num_cols): """Displays individual TOPP parameters in given number of columns""" cols = st.columns(num_cols) i = 0 + # st.write("--------------------------------display_TOPP_params------------------------------------") + # st.write(params) for p in params: # get key and name – use tool_instance_name in session state key key_str = p['key'].decode() if tool_instance_name != topp_tool_name: key_str = key_str.replace(f"{topp_tool_name}:1:", f"{tool_instance_name}:1:", 1) key = f"{self.parameter_manager.topp_param_prefix}{key_str}" + # st.write("--------------------------------p['key'](display_TOPP_params loop)------------------------------------") + # st.write(key) name = p["name"] try: # sometimes strings with newline, handle as list if isinstance(p["value"], str) and "\n" in p["value"]: p["value"] = p["value"].split("\n") + # no-value CLI flag parameters should be shown as checkboxes + if p.get("is_flag", False): + flag_default = p["value"] + if isinstance(flag_default, str): + flag_default = flag_default.lower() in {"true", "1", "yes", "on"} + else: + flag_default = bool(flag_default) + # Streamlit widget keys persist in session_state and can override + # updated custom_defaults. Normalize and seed key explicitly. + if key in st.session_state: + current = st.session_state[key] + if isinstance(current, str): + st.session_state[key] = current.lower() in {"true", "1", "yes", "on"} + else: + st.session_state[key] = bool(current) + else: + st.session_state[key] = flag_default + cols[i].selectbox( + name, + options=[True, False], + index=0 if st.session_state[key] else 1, + format_func=lambda x: "True" if x else "False", + help=p["description"], + key=key, + ) # bools - if isinstance(p["value"], bool): - cols[i].markdown("##") - cols[i].checkbox( + elif isinstance(p["value"], bool): + bool_value = ( + (p["value"] == "true") + if type(p["value"]) == str + else p["value"] + ) + cols[i].selectbox( name, - value=( - (p["value"] == "true") - if type(p["value"]) == str - else p["value"] - ), + options=[True, False], + index=0 if bool_value else 1, + format_func=lambda x: "True" if x else "False", help=p["description"], key=key, ) @@ -922,7 +968,6 @@ def on_multiselect_change(dk=display_key, tk=key): cols[i].error(f"Error in parameter **{p['name']}**.") print('Error parsing "' + p["name"] + '": ' + str(e)) - for section, params in param_sections.items(): if tabs is None: show_subsection_header(section, display_subsections) diff --git a/src/workflow/WorkflowManager.py b/src/workflow/WorkflowManager.py index a87fb339..c35ecc55 100644 --- a/src/workflow/WorkflowManager.py +++ b/src/workflow/WorkflowManager.py @@ -194,6 +194,8 @@ def _stop_local_workflow(self) -> bool: """Stop locally running workflow process""" import os import signal + import platform + import subprocess pid_dir = self.executor.pid_dir if not pid_dir.exists(): @@ -203,11 +205,17 @@ def _stop_local_workflow(self) -> bool: for pid_file in pid_dir.iterdir(): try: pid = int(pid_file.name) - os.kill(pid, signal.SIGTERM) + if platform.system() == "Windows": + subprocess.call(["taskkill", "/F", "/T", "/PID", str(pid)], + capture_output=True, text=True) + else: + os.kill(pid, signal.SIGTERM) pid_file.unlink() stopped = True - except (ValueError, ProcessLookupError, PermissionError): - pid_file.unlink() # Clean up stale PID file + except (ValueError, ProcessLookupError, PermissionError, Exception) as e: + print(f"Error stopping process {pid}: {e}") + if pid_file.exists(): + pid_file.unlink() # Clean up stale PID file # Clean up the pid directory shutil.rmtree(pid_dir, ignore_errors=True)