diff --git a/paper/paper.bib b/paper/paper.bib index 56d7e3a..5f0172f 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -147,4 +147,72 @@ @article{Waskom2021 year = {2021}, doi = {10.21105/joss.03021}, url = {https://doi.org/10.21105/joss.03021} +} + +@article{shannon1948mathematical, + title={A mathematical theory of communication}, + author={Shannon, Claude Elwood}, + journal={The Bell System Technical Journal}, + volume={27}, + number={3}, + pages={379--423}, + year={1948}, + publisher={Nokia Bell Labs} +} + +@article{pielou1966measurement, + title={The measurement of diversity in different types of biological collections}, + author={Pielou, Evelyn C.}, + journal={Journal of Theoretical Biology}, + volume={13}, + pages={131--144}, + year={1966}, + publisher={Elsevier} +} + +@article{chao2002estimating, + title={Estimating the number of species in a stochastic abundance model}, + author={Chao, Anne and Bunge, John}, + journal={Biometrics}, + volume={58}, + number={3}, + pages={531--539}, + year={2002}, + publisher={Wiley Online Library} +} + +@article{bray10jt, + title={An ordination of the upland forest communities of southern {Wisconsin}}, + author={Bray, J. Roger and Curtis, John T.}, + journal={Ecological Monographs}, + volume={27}, + number={4}, + pages={325--349}, + year={1957}, + doi={10.2307/1942268} +} + +@article{jaccard1901etude, + title={{\'E}tude comparative de la distribution florale dans une portion des {Alpes} et des {Jura}}, + author={Jaccard, Paul}, + journal={Bulletin de la Soci{\'e}t{\'e} Vaudoise des Sciences Naturelles}, + volume={37}, + pages={547--579}, + year={1901} +} + + +@article{ijms26135941, + AUTHOR = {Popov, Ilia V. and Manakhov, Andrey D. and Gorobets, Vladislav E. and Diakova, Kristina B. and Lukbanova, Ekaterina A. and Malinovkin, Aleksey V. and Venema, Koen and Ermakov, Alexey M. 
and Popov, Igor V.}, + TITLE = {Metagenomic Investigation of Intestinal Microbiota of Insectivorous Synanthropic Bats: Densoviruses, Antibiotic Resistance Genes, and Functional Profiling of Gut Microbial Communities}, + JOURNAL = {International Journal of Molecular Sciences}, + VOLUME = {26}, + YEAR = {2025}, + NUMBER = {13}, + ARTICLE-NUMBER = {5941}, + URL = {https://www.mdpi.com/1422-0067/26/13/5941}, + PubMedID = {40649719}, + ISSN = {1422-0067}, + ABSTRACT = {Bats serve as key ecological reservoirs of diverse microbial communities, including emerging viruses and antibiotic resistance genes. This study investigates the intestinal microbiota of two insectivorous bat species, Nyctalus noctula and Vespertilio murinus, at the Rostov Bat Rehabilitation Center in Southern Russia using whole metagenome shotgun sequencing. We analyzed taxonomic composition, functional pathways, antibiotic resistance genes, and virulence factors. Densoviruses, especially those closely related to Parus major densovirus, were the most dominant viral sequences identified. Metagenome-assembled densovirus genomes showed high sequence similarity with structural variations and clustered phylogenomically with viruses from mealworms and birds, reflecting both dietary origins and the potential for vertebrate infection. Functional profiling revealed microbial pathways associated with cell wall biosynthesis, energy metabolism, and biofilm formation. A total of 510 antibiotic resistance genes, representing 142 unique types, mainly efflux pumps and β-lactamases, were identified. Additionally, 870 virulence factor genes were detected, with a conserved set of iron acquisition systems and stress response regulators across all samples. 
These findings highlight the ecological complexity of bat-associated microbiota and viromes and suggest that synanthropic bats may contribute to the circulation of insect-associated viruses and antimicrobial resistance in urban settings.}, + DOI = {10.3390/ijms26135941} } \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index 6338d14..e3d2bf6 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -13,41 +13,47 @@ affiliations: - name: Faculty of Bioengineering and Veterinary Medicine, Don State Technical University, Russia index: 1 ror: 00x5je630 -date: 1 June 2025 +date: 3 June 2026 bibliography: paper.bib --- # Summary -`KrakenParser` is an open-source software tool (with a command-line interface and Python API) designed to streamline the post-analysis of metagenomic classification results produced by `Kraken2` [@wood2019kraken2] and similar taxonomic profilers such as `Bracken` [@lu2017bracken] and `Metabuli` [@kim2024metabuli]. `Kraken2` is a widely used taxonomic classifier that assigns metagenomic reads to taxa using exact k-mer matches, achieving high speed and accuracy. However, the raw output of `Kraken2` [@wood2019kraken2] (and related tools) is a text report that can be cumbersome to interpret and aggregate across multiple samples. `KrakenParser` addresses this need by converting multiple `Kraken`-format reports into structured tables (CSV files) at various taxonomic ranks (from phylum down to species), performing filtering and normalization (including relative abundance calculations), and providing APIs to produce publication-ready plots. The tool automates the multi-step process of combining and cleaning `Kraken` results, allowing researchers to quickly obtain human-readable summaries of community composition. `KrakenParser`’s focus is on efficiency, ease-of-use, and integration: it can run an entire conversion pipeline with a single command and also be imported as a Python library for custom workflows. 
In summary, `KrakenParser` significantly reduces the manual effort required to post-process metagenomic classification data, enabling scientists to go from raw classifier output to analysis-ready tables and figures in one step. +`KrakenParser` is an open-source Python library and command-line interface (CLI) designed to automate and standardize the post-processing of metagenomic taxonomic classification reports generated by `Kraken2` [@wood2019kraken2], `Bracken` [@lu2017bracken], and `Metabuli` [@kim2024metabuli]. Although these high-throughput profilers provide rapid taxonomic assignments, their primary outputs comprise independent, hierarchical text files that are challenging to aggregate, filter, and analyze across multi-sample cohorts. `KrakenParser` addresses this bottleneck by parsing and merging separate report files into unified, rank-specific (from phylum down to species) count matrices formatted as tidy CSV tables. Additionally, the tool provides advanced taxonomic filtering, normalizes raw counts into relative abundances, computes ecological alpha and beta diversity metrics, and offers an object-oriented API for publication-ready visualizations. `KrakenParser` can be deployed as an automated end-to-end pipeline via a single CLI command or integrated as a modular library within custom bioinformatics workflows. # Statement of need -Analyzing the taxonomic profiles of metagenomic samples often involves running k-mer based classifiers (like `Kraken2`) that generate detailed reports of read counts and abundances across taxa. These reports, while information-rich, are not immediately convenient for comparative analysis: they list each taxon in a hierarchical format for a single sample, and researchers must manually parse and merge multiple files to compare communities across samples. 
Existing scripts such as the `KrakenTools` suite [@lu2022kraken] (developed alongside `Kraken`) provide some post-processing functionality, but they require multiple steps and technical expertise to use. Similarly, interactive tools like `Pavian` focus on visualization and exploration of `Kraken` results rather than automated batch processing [@breitwieser2020pavian]. There is a clear need for a streamlined solution to transform raw `Kraken`-family outputs into tidy data matrices and summary statistics that can be readily used in downstream analysis or publication figures. `KrakenParser` fulfills this need by offering an all-in-one pipeline that reads in multiple `Kraken2`/`Bracken`/`Metabuli` reports and outputs clean CSV tables of taxonomic counts or relative abundances, optionally filtering out low-abundance taxa or non-target taxa (e.g. human reads) as specified by the user. This greatly simplifies metagenomic workflows, especially in comparative studies or clinical settings where dozens of samples must be processed consistently. By bridging the gap between raw classifier output and statistical analysis, KrakenParser empowers researchers who may not be bioinformatics experts to leverage high-throughput metagenomics with minimal data wrangling. +Comparative metagenomics and microbiome studies depend fundamentally on cross-sample data matrices to perform downstream statistical profiling and ecological modeling. Existing utilities only partially address this requirement. The `KrakenTools` suite [@lu2022kraken] provides low-level text manipulation scripts but lacks an integrated, automated execution framework, requiring manual multi-step invocations and custom scripting to generate structured matrices. Graphical platforms like `Pavian` [@breitwieser2020pavian] excel at interactive data exploration but are poorly suited for headless batch execution, high-performance computing (HPC) clusters, or continuous integration pipelines. 
Lightweight scripts, such as `spideog`, convert reports to flat tables but do not provide embedded statistical normalization or plotting capabilities. -Metagenomic classification has seen rapid development, with numerous tools available for assigning sequencing reads to taxa. `Kraken` was introduced in 2014 as an ultrafast k-mer based classifier [@wood2014kraken], and its successor `Kraken2` [@wood2019kraken2] further reduced memory usage and improved speed . Other k-mer classifiers include `Bracken` [@lu2017bracken], which refines `Kraken`’s counts to improve abundance estimates, `KrakenUniq` which tracks unique k-mers per taxon to reduce false positives [@breitwieser2018krakenuniq], `Centrifuge` which uses an FM-index to allow classification with compressed databases [@kim2016centrifuge], and `CLARK` which uses discriminative k-mers for fast classification [@ounit2015clark]. More recently, tools like `Kaiju` perform classification in protein space for greater sensitivity (especially on viruses) [@menzel2016kaiju], and `Metabuli` combines DNA and translated amino acid matching to improve accuracy [@kim2024metabuli]. Comprehensive evaluations have benchmarked these methods’ accuracy and speed, and community challenges like `CAMI` have pushed development of improved classifiers [@sczyrba2017cami]. Despite the variety of classifiers, a common challenge remains: the output format. Many tools output reports similar to `Kraken`’s: tab-delimited text with hierarchical labels and counts. To interpret such outputs, researchers often rely on additional scripts or manual processing. `KrakenTools` [@lu2022kraken] provides scripts to combine `Kraken` reports, convert to other formats (e.g., `Krona` for visualization). `Pavian` and other interactive platforms allow users to visualize results with `Sankey` diagrams and heatmaps [@breitwieser2020pavian], but require use of a web interface or `R` environment. 
There are also lightweight utilities (e.g., [`spideog`](https://github.com/jeanmanguy/spideog)) to convert Kraken reports to CSV or clean them, and researchers adept in programming sometimes write custom parsing scripts. In summary, prior to `KrakenParser`, users had to piece together multiple tools to achieve tasks like merging reports from multiple samples, summing reads at specific taxonomic ranks, and computing relative abundances. `KrakenParser` builds on this state of the field by consolidating the post-processing steps into one tool. It serves as an ideological successor to `KrakenTools` [@lu2022kraken], using some of the same internal conversion steps (like `KrakenTools`’ report-to-MPA conversion) but adding improvements in automation, filtering, and output formatting. By producing standardized CSV tables (with samples as rows and taxa as columns) and by computing percentages automatically, `KrakenParser` greatly accelerates the transition from raw classification data to biological insights. This is particularly valuable given the increasing scale of metagenomic studies (where dozens or hundreds of samples are profiled) and the need for reproducible, efficient analysis pipelines. +`KrakenParser` consolidates these fragmented workflows into a single production-ready environment. Since the inception of fast k-mer based classification [@wood2014kraken], the taxonomic profiling ecosystem has evolved to include diverse specialized algorithms such as `Centrifuge` [@kim2016centrifuge], `CLARK` [@ounit2015clark], `Kaiju` [@menzel2016kaiju], and `KrakenUniq` [@breitwieser2018krakenuniq], as well as community benchmarking initiatives such as `CAMI` [@sczyrba2017cami]. However, the lack of standardized downstream processing tools remains a barrier. 
As an ideological successor to `KrakenTools` [@lu2022kraken], `KrakenParser` abstracts low-level file parsing into a robust, scalable pipeline, allowing researchers to transition from raw outputs to ecological indices and visualizations without intermediate data-wrangling scripts, thereby improving reproducibility in large-scale cohort studies. -# Implementation +# Implementation and Software Design -`KrakenParser` is implemented in `Python` (available via `PyPI` as `krakenparser`) with several auxiliary scripts. It leverages the original `KrakenTools` [@lu2022kraken] scripts for initial data reshaping and then applies its own pure-`Python` processing for downstream formatting. The software follows a pipeline of six main steps, which can be executed automatically in sequence (`--complete` mode) or run individually as needed: +`KrakenParser` is implemented in Python 3 (distributed via PyPI as `krakenparser`) and follows a modular architecture split into three distinct operational layers: Data Processing, Statistical Analysis, and Visualization. The pipeline can be executed in an end-to-end automated mode by providing global input and output paths directly to the main command, or controlled step-by-step through granular subcommands. -1. Convert reports to MPA format: Each `Kraken2`/`Bracken`/`Metabuli` report (text file with taxon lines) is converted to an “MPA” table format using `KrakenTools`’ `kreport2mpa.py` script. In MPA format, each row corresponds to a read and columns correspond to taxonomic ranks, allowing easy combination of multiple samples. -2. Combine MPA files: All per-sample MPA files are merged into a single master table (samples × taxa) using `KrakenTools`’ combine_mpa.py. This yields a matrix of raw read counts, with entries where a taxon is absent in a sample filled with zero. -3. Deconstruct taxonomic levels: The combined data is split out by rank. 
`KrakenParser` extracts separate text files for phylum, class, order, family, genus, and species counts. During this step, it can optionally isolate certain domains; for example, using `--deconstruct_viruses` will produce a file of only viral species counts, ignoring other domains. Also, the default `--deconstruct` excludes reads classified as human to focus on microbial content. -4. Process extracted data: Each rank-specific text file is cleaned and formatted. `KrakenParser` removes classification prefixes (like “s__” for species, “g__” for genus) and replaces underscores with spaces for readability. This step ensures taxon names are human-friendly (e.g. “s__Escherichia_coli” becomes “Escherichia coli”). -5. Convert to CSV: The cleaned text tables are converted to CSV files (comma-separated values). In this transpose operation, taxa become columns and sample identifiers become rows, yielding a standard matrix format. This structured CSV is easy to import into statistical software, spreadsheets, or R/Python data frames for further analysis. -6. Calculate relative abundances: For each count table, `KrakenParser` can create a corresponding relative abundance table (`--relabund` option) by computing percentages of total reads per sample, using the formula: $\text{Relative Abundance} = \left( \frac{\text{Number of individuals of taxa}}{\text{Total number of individuals of all taxa}} \right) \times 100$. Users can specify a threshold to group low-abundance taxa into an “Other” category. This results in a normalized profile for each sample, often more interpretable in comparative studies than raw counts. +## Data Processing and Filtering +Individual taxonomic reports are programmatically parsed, converted into MetaPhlAn (MPA) tables, and merged into a unified cross-sample master count matrix. This matrix is subsequently deconstructed into distinct tables for each major taxonomic rank. 
During deconstruction, `KrakenParser` purges internal structural prefixes (e.g., stripping `s__` from species names) and normalizes taxonomic strings by replacing underscores with spaces to ensure human readability and compatibility with downstream software. -Each of these steps is exposed as a sub-command in the CLI, so advanced users can integrate KrakenParser into custom workflows. By default, running `KrakenParser --complete -i /kreports` executes all steps sequentially, writing outputs to a structured directory tree (with subfolders for each step). The outputs include one CSV file per rank (e.g. counts_phylum.csv, counts_species.csv) containing absolute read counts, and similarly named files under a `csv_relabund/` directory for percentages if requested. KrakenParser is optimized for speed and memory efficiency given the nature of the task: it processes text files line by line and uses `pandas` data frames for merging and calculations, which easily handle dozens of samples and tens of thousands of taxa on a standard workstation. The reliance on `KrakenTools` for the initial conversion ensures that the parsing logic benefits from the robustness of well-tested scripts, while the unified interface adds convenience. The tool also includes built-in help for each subcommand (`-h`), guiding users on required inputs and options. `KrakenParser`’s design reflects practical needs observed in the metagenomics community - it was tested during the [2025 “Bioinformatics Bootcamp”](https://pish.itmo.ru/genomics-bootcamp) hackathon organized by ITMO University, where teams analyzing metagenomic datasets were able to obtain meaningful results in a short time thanks to `KrakenParser`’s streamlined processing pipeline. By combining established methods with new automation, `KrakenParser` provides an efficient, reproducible, and user-friendly means to handle the otherwise tedious steps of post-classification data processing. 
+The core data engine features flexible filtering mechanisms. Users can selectively isolate or exclude specific biological domains or kingdoms (Bacteria, Viruses, Archaea, Fungi) during extraction. While non-target host reads (e.g., human contamination) are filtered out by default to focus on microbial signatures, the `--keep-human` flag preserves host read counts within the output matrices. Crucially, `--keep-human` can be combined concurrently with domain-specific filters, allowing the simultaneous evaluation of host-to-microbe or host-to-pathogen abundance ratios within a single run. -`KrakenParser` also offers a suite of `Python`-based visualization tools to facilitate the interpretation of taxonomic profiles: +## Statistical Analysis +Following matrix generation, the statistical module computes normalization metrics and ecological indices directly: -- Stacked Bar Plots: Utilizing `matplotlib` [@Hunter2007] and `pandas` [@reback2020pandas], `KrakenParser` can generate stacked bar plots that display the relative abundances of taxa across multiple samples. These plots provide a clear comparison of taxonomic compositions between samples. -- Streamgraphs: For a more dynamic representation, `KrakenParser` can create streamgraphs using `matplotlib`’s [@Hunter2007] stackplot function with a symmetric baseline. This visualization emphasizes changes in taxa abundances over a series of samples, highlighting temporal or sequential patterns.  -- Combined Visualizations: To offer both detailed and overarching views, `KrakenParser` supports combined plots that integrate stacked bar plots and streamgraphs. This dual representation aids in comprehensive data analysis. -- Clustermaps: Employing `seaborn` [@Waskom2021], `KrakenParser` can produce clustermaps that perform hierarchical clustering on taxa and samples. These heatmaps reveal patterns and groupings in the data, facilitating the identification of similar taxonomic profiles. 
+* **Relative Abundance:** Normalizes absolute counts into percentage distributions using the formula: $\text{Relative Abundance} = \left( \frac{\text{Number of individuals of taxa}}{\text{Total number of individuals of all taxa}} \right) \times 100$. A user-defined abundance threshold aggregates rare background taxa into a consolidated `Other` category to simplify downstream parsing and plotting. +* **Alpha Diversity:** Calculates *Shannon* [@shannon1948mathematical], *Pielou’s evenness* [@pielou1966measurement], and *Chao1* [@chao2002estimating] indices. To mitigate artifacts caused by uneven sequencing depths across different sequencing runs, a built-in rarefaction procedure subsamples reads to a uniform user-specified depth prior to calculating indices. +* **Beta Diversity:** Computes compositional dissimilarity between samples via *Bray-Curtis* [@bray10jt] and *Jaccard* [@jaccard1901etude] distance metrics, exporting standard distance matrices ready for ordination. -These visualization tools are accessible through the `KrakenParser` Python API, allowing users to customize and integrate them into their analysis workflows seamlessly. +## Visualization +The `kpplot` module utilizes an object-oriented design inheriting from a unified base configuration class (`KpPlotBase`), enforcing consistent rendering properties such as DPI, bounding box scaling, and layout properties. Built on top of `matplotlib` [@Hunter2007], `pandas` [@reback2020pandas], and `seaborn` [@Waskom2021], the visualization engine exposes four primary programmatic layouts: + +* **Stacked Bar Plots:** For comparing relative taxonomic proportions across multi-sample cohorts. +* **Streamgraphs:** Using symmetric baselines to map continuous structural transitions or temporal trends in taxonomic composition. +* **Combined Visualizations:** Integrating stacked bars and streamgraphs into synchronized multi-panel layouts. 
+* **Clustermaps:** Applying unsupervised hierarchical clustering to reveal sample similarities and taxonomic co-occurrence blocks. + +# Research Impact + +The functional reliability and execution integrity of `KrakenParser` are validated via automated continuous integration workflows. The utility and user readiness of the software were demonstrated during the [2025 “Bioinformatics Bootcamp”](https://pish.itmo.ru/genomics-bootcamp) hackathon. Furthermore, the core structural prototype of this tool was successfully utilized for large-scale metagenomic data analysis by @ijms26135941. # Acknowledgements