From 10912c087c99d6f8888e2ea5f0b7b1b4dca3e89c Mon Sep 17 00:00:00 2001 From: Aaron Ragsdale Date: Thu, 5 Mar 2026 11:05:46 -0600 Subject: [PATCH 1/5] Edit trees.py to add docstring to ld_matrix() --- python/tskit/trees.py | 81 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 45d2da59e0..8ce99280c1 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -10930,12 +10930,89 @@ def impute_unknown_mutations_time( def ld_matrix( self, sample_sets=None, - sites=None, - positions=None, mode="site", stat="r2", + sites=None, + positions=None, indexes=None, ): + r""" + + Returns a matrix of the specified two-locus statistic (default + :math:`r^2`) computed from sample allelic states or branch lengths. + The resulting linkage disequilibrium (LD) matrix represents either the + two-locus statistic as computed between all pairs of specified + ``sites`` ("site" mode, producing a num_sites-by-num_sites sized + matrix), or as computed from the branch structures at marginal trees + between pairs of trees at all specified ``positions`` ("branch" mode, + producing a num_positions-by-num_positions sized matrix). + + In the site mode, the sites under consideration can be restricted using + the ``sites`` argument. Sites can be passed as a list of lists, + specifying the ``[[row_sites], [col_sites]]``, resulting in a + rectangular matrix, or by specifying a single list of ``[sites]``, in + which a square matrix will be produced (see + :ref:`sec_stats_two_locus_site` for examples). + + Similarly, in the branch mode, the ``positions`` argument specifies + loci for which the expectation for the two-locus statistic is computed + over pairs of trees at those positions. LD statis are computed between + trees whose ``[start, end)`` contains the given position (such that + repeats of trees are possible). Similar to the site mode, a nested list + of row and column positions can be specified separately (resulting in a + rectangular matrix) or a single list of a specified positions results + in a square matrix (see :ref:`sec_stats_two_locus_branch` for + examples). + + Some LD statistics are defined for two sample sets instead of within a + single set of samples. If the ``indexes`` argument is specified, at + least two sample sets must also be specified. ``indexes`` specifies the + sample set indexes between which to compute LD. + + For more on how the ``indexes`` and ``sample_sets`` interact with the + output dimensions, see the :ref:`sec_stats_two_locus_sample_sets` + section. + + **Available Stats** (use ``Stat Name`` in the ``stat`` keyword + argument). + + ======================= ========== ================ ============== + Stat Polarised Multi Sample Set Stat Name + ======================= ========== ================ ============== + :math:`r^2` n y "r2" + :math:`r` y n "r" + :math:`D^2` n y "D2" + :math:`D` y n "D" + :math:`D'` y n "D_prime" + :math:`D_z` n n "Dz" + :math:`\pi2` n n "pi2" + :math:`\widehat{D^2}` n y "D2_unbiased" + :math:`\widehat{D_z}` n n "Dz_unbiased" + :math:`\widehat{\pi_2}` n n "pi2_unbiased" + ======================= ========== ================ ============== + + :param list sample_sets: A list of lists of sample Node IDs, specifying + the groups of nodes to compute the statistic with. Defaults to all + samples grouped by population. + :param str mode: A string giving the "type" of the statistic to be + computed. Defaults to "site", can be "site" or "branch". + :param str stat: A string giving the selected two-locus statistic to + compute. Defaults to "r2". + :param list sites: A list of sites over which to compute LD. Can be + specified as a list of lists to control the row and column sites. + Only applicable in site mode. Specify as + ``[[row_sites], [col_sites]]`` or ``[all_sites]``. + :param list positions: A list of genomic positions where expected LD is + computed. Only applicable in branch mode. Can be specified as a list + of lists to control the row and column positions. Specify as + ``[[row_positions], [col_positions]]`` or ``[all_positions]``. + :param list indexes: A list of 2-tuples or a single 2-tuple, specifying + the indexes of two sample sets over which to compute a two-way LD + statistic. Only :math:`r^2`, :math:`D^2`, and :math:`\widehat{D^2}` + are implemented for two-way statistics. + :return: A 2D or 3D array of LD matrices. + :rtype: numpy.ndarray + """ one_way_stats = { "D": self._ll_tree_sequence.D_matrix, "D2": self._ll_tree_sequence.D2_matrix, From a2c59d5bffb9ba51ccde62597988cdf149d6858a Mon Sep 17 00:00:00 2001 From: Aaron Ragsdale Date: Thu, 5 Mar 2026 18:38:08 -0600 Subject: [PATCH 2/5] Add, edit, streamline two-locus docs --- docs/stats.md | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) diff --git a/docs/stats.md b/docs/stats.md index 87a2ed5f83..e320eb9046 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -684,8 +684,251 @@ and boolean expressions (e.g., {math}`(x > 0)`) are interpreted as 0/1. and {math}`v_j` is the covariance of the trait with the j-th covariate. +(sec_stats_multi_site)= + ## Multi site statistics +(sec_stats_two_locus)= + +### Two-locus statistics + +The {meth}`~TreeSequence.ld_matrix` method provides an interface to +a collection of two-locus statistics with predefined summary functions (see +{ref}`sec_stats_two_locus_summary_functions`) and `site` and `branch` +{ref}`modes `. The LD matrix method differs from other +statistics methods in that it provides a unified API with an argument to +specify different two-locus summaries of the data. It otherwise behaves +similarly to most other functions with respect to `sample_sets` and `indexes`. + +Two-locus statistics can be computed using two modes, either `sites` or +`branch`, and these should be interpreted in the same way as these modes in the +single-site statistics. Site statistics allow for multi-allelic data, while +branch statistics assume an infinite sites model. Within this framework, we +also implement polarisation, but do not expose it to the user, opting to +provide statistics that are polarised where appropriate. + +(sec_stats_two_locus_site)= + +#### Site + +The `site` mode computes two-locus statistics summarized over alleles between +all pairs of specified sites. The default behavior, leaving `sites` +unspecified, this method will compute a matrix for all pairs of sites, with +rows and columns representing each site in the tree sequence (i.e., an n×n +matrix where n is the number of sites in the tree sequence). We can also +restrict the output to a subset of sites, either by specifying a single vector +for both rows and columns or a pair of vectors for the row sites and column +sites separately. + +The following computes a matrix of the {math}`r^2` measure of linkage +disequilibrium (LD) computed pairwise between the first 4 sites in the tree +sequence among all samples. In our computations, row sites are used as the row +("left-hand") loci and column sites are used as the column ("right-hand") +locus, and with a single list of sites specified, we obtain a symmetric square +matrix. + +```{code-cell} ipython3 +ld = ts.ld_matrix(sites=[[0, 1, 2, 3]]) +print(ld) +``` + +The following demonstrates how we can specify the row and column sites +independently of each other. We're specifying 3 columns and 2 rows, which +computes a subset of the matrix shown above. + +```{code-cell} ipython3 +ld = ts.ld_matrix(sites=[[0, 1], [1, 2, 3]]) +print(ld) +``` + +Because we implement two-locus statistics for multi-allelic data, we require +a method for combining the statistical results from each pair of alleles into +one summary for a pair of sites. There two methods for combining results from +multiple alleles, `hap_weighted` and `total_weighted`, which are +statistic-specific and not chosen by the user: + +(sec_stats_two_locus_branch)= + +#### Branch + +The `branch` mode computes two-locus statistics between pairs of trees. The +resulting statistic is a summary depending on marginal tree topologies and the +branch lengths of the trees at a pair of positions. To perform this +computation, we consider the counts of all possible two-locus haplotypes that +may be generated by possible mutations on each pair of branches between the two +trees, which is summarized using an LD statistic. We then sum each pairwise +comparison weighted by the product of the two branch lengths above the nodes on +each tree. + +Similar to the single-site statistics computed in `branch` mode, this results +in a statistic that is proportional to the expected statistic under an infinite +sites model (which mutation rate 1), conditioned on the pair of trees. + +The time complexity of this method is quadratic, due to the pairwise +comparisons of branches from a pair of trees. By default, this method computes +a symmetric matrix for all pairs of trees, with rows and columns representing +each tree in the tree sequence. Similar to the site method, we can restrict the +output to a subset of trees, either by specifying a vector of positions or +a pair of vectors for row and column positions separately. To select a specific +tree, the specified positions must land in the tree span (`[start, end)`). + +In the following, we compute a matrix of expected {math}`r^2` within and +between the first 4 trees in the tree sequence. The tree breakpoints are +a convenient way to specify those first four trees tree. + +```{code-cell} ipython3 +ld = ts.ld_matrix(mode="branch", positions=[ts.breakpoints(as_array=True)[0:4]]) +print(ld) +``` + +Again, we can specify the row and column trees separately. + +```{code-cell} ipython3 +breakpoints = ts.breakpoints(as_array=True) +ld = ts.ld_matrix(mode="branch", positions=[breakpoints[[0]], breakpoints[0:4]]) +print(ld) +``` + +#### Sample Sets + +The two-locus API provides a mechanism by which to subset the samples under +consideration, providing the ability to compute a separate LD matrix for each +sample set or an LD matrix between sample sets. Output dimensions are handled +in the same manner as the rest of the stats API (see +{ref}`sec_stats_output_dimensions`). The two-locus API differs from the rest of +the stats API in that we handle one-way and two-way statistics in the same +function call. + +To compute a two-way two-locus statistic, the `index` argument must be +provided. The statistics are selected in the same way (with the `stat` +argument), but we provide a restricted set of two-way statistics (see +{ref}`sec_stats_two_locus_summary_functions_two_way`). The dimension-dropping +rules for the result follow the rest of the tskit stats API in that scalar +indexes will produce a single two-dimensional matrix, while list of indexes +will produce a three-dimensional array, with the outer dimension of length +equal to the length of the list of indexes. + +For concreteness, we would expect the following dimensions with the specified +`sample_sets` and `indexes` arguments. + +``` +# one-way +ts.ld_matrix(sample_sets=None) -> 2 dimensions +ts.ld_matrix(sample_sets=[0, 1, 2, 3]) -> 2 dimensions +ts.ld_matrix(sample_sets=[[0, 1, 2, 3]]) -> 3 dimensions +# two-way +ts.ld_matrix(sample_sets=[[0, 1, 2, 3], [4, 5, 6, 7]], indexes=(0, 1)) -> 2 dimensions +ts.ld_matrix(sample_sets=[[0, 1, 2, 3], [4, 5, 6, 7]], indexes=[(0, 1)]) -> 3 dimensions +``` + +(sec_stats_two_locus_sample_one_way_stats)= + +##### One-way Statistics + +One-way statistics are summaries of two loci in a single sample set, using a +triple of haplotype counts {math}`\{w_{Ab}, w_{aB}, w_{AB}\}` and the size of +the sample set {math}`n`, where the capitalized and lowercase letters in our notation +represent alternate alleles. + +(sec_stats_two_locus_sample_two_way_stats)= + +##### Two-way Statistics + +Two-way statistics are summaries of haplotype counts between two sample sets, +which operate on the three haplotype counts (as in one-way stats, above) +computed from each sample set, indexed by `(i, j)`. These statistics take on +a different meaning from their one-way counterparts. For instance {math}`D_i +D_j` can be thought of as the covariance between two loci when computed for +a haplotype in a single sample set. + +Only a subset of our summary functions are two-way statistics (see +{ref}`sec_two_locus_summary_functions_two_way`). Note that the unbiased two-way +statistics expect non-overlapping sample sets, and we do not make any +assertions about the sample sets and assume that `i` and `j` represent disjoint +sets of samples (see also the note in {meth}`~TreeSequence.divergence`). + +(sec_stats_two_locus_summary_functions)= + +#### Summary Functions + +(sec_stats_two_locus_summary_functions_one_way)= + +##### One-way + +The two-locus summary functions all take haplotype counts and sample set size as +input. Each of our summary functions has the signature +{math}`f(w_{Ab}, w_{aB}, w_{AB}, n)`, converting to haplotype frequencies +{math}`\{p_{Ab}, p_{aB}, p_{AB}\}` where appropriate by dividing by {math}`n`. + +`D` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = p_{ab} - p_{a}p_{b}` + + This statistic is polarised, as the unpolarised result, which averages over + allele labelings, is zero. Uses the `total` normalisation method. + +`D_prime` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = \frac{D}{D_{\max}}` + + Where {math}``` + D_{\max} = \begin{cases} + \min\{p_A (1-p_B), p_B (1-p_B)\} & \textrm{if }D>=0 \\ + \min\{p_A p_B, (1-p_B) (1-p_B)\} & \textrm{otherwise} + \end{cases}``` + + and {math}`D` is defined above. + +`D2` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = (p_{ab} - p_{a} p_{b})^2` + + Unpolarised, total weighted. + +`Dz` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = D (1 - 2 p_{a})(1 - 2p_{b})` + + Where {math}`D` is defined above. Unpolarised, total weighted. + +`pi2` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = p_{a}p_{b}(1-p_{a})(1-p_{b})` + + Unpolarised, total weighted. + +`r` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = \frac{D}{\sqrt{p_{a}p_{b}(1-p_{a})(1-p_{b})}}` + + Where {math}`D` is defined above. Polarised, total weighted. + +`r2` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = \frac{D^{2}}{p_{a}p_{b}(1-p_{a})(1-p_{b})}` + + Where {math}`D` is defined above. Unpolarised, haplotype weighted. + +Unbiased two-locus statistics from the Hill-Robertson (1968) system are +computed from haplotype counts. Derivations for these unbiased estimators can +be found in [Ragsdale and Gravel +(2020)](https://doi.org/10.1093/molbev/msz265). They require at least 4 samples +to be valid and are specified as `stat=D2_unbiased`, `Dz_unbiased`, or +`pi2_unbiased`. + +(sec_two_locus_summary_functions_two_way)= + +(sec_stats_two_locus_summary_functions_two_way)= + +##### Two-way + +Two-way statistics are indexed by sample sets {math}`i, j` and compute values +using haplotype counts within pairs of sample sets. + +`D2` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = D_i * D_j` + +Where {math}`D` is defined above. + +`r2` +: {math}`f(w_{Ab}, w_{aB}, w_{AB}, n) = \frac{(p_{AB_i} - (p_{A_i} p_{B_i})) (p_{AB_j} - (p_{A_j} p_{B_j}))}{\sqrt{p_{A_i} (1 - p_{A_i}) p_{B_i} (1 - p_{B_i})}\sqrt{p_{A_j} (1 - p_{A_j}) p_{B_j} (1 - p_{B_j})}}` + +And `D2_unbiased`, which can be found in [Ragsdale and Gravel +(2020)](https://doi.org/10.1093/molbev/msz265). + :::{todo} Document statistics that use information about correlation between sites, such as LdCalculator (and perhaps reference {ref}`sec_identity`). Note that if we have a general From 51ab754c1855c0b0aff83ec9859c8a821fcc5a8d Mon Sep 17 00:00:00 2001 From: Aaron Ragsdale Date: Thu, 5 Mar 2026 18:48:14 -0600 Subject: [PATCH 3/5] add section title --- docs/stats.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/stats.md b/docs/stats.md index e320eb9046..94442ee445 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -789,6 +789,8 @@ ld = ts.ld_matrix(mode="branch", positions=[breakpoints[[0]], breakpoints[0:4]]) print(ld) ``` +(sec_stats_two_locus_sample_sets)= + #### Sample Sets The two-locus API provides a mechanism by which to subset the samples under From fd6e6f7d7293d4070051b0f418efdeb0d1345889 Mon Sep 17 00:00:00 2001 From: Aaron Ragsdale Date: Fri, 6 Mar 2026 08:27:03 -0600 Subject: [PATCH 4/5] Address comments from Jerome --- docs/stats.md | 18 +++++------------- python/tskit/trees.py | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/docs/stats.md b/docs/stats.md index 94442ee445..cf256c4eaa 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -700,12 +700,11 @@ statistics methods in that it provides a unified API with an argument to specify different two-locus summaries of the data. It otherwise behaves similarly to most other functions with respect to `sample_sets` and `indexes`. -Two-locus statistics can be computed using two modes, either `sites` or +Two-locus statistics can be computed using two modes, either `site` or `branch`, and these should be interpreted in the same way as these modes in the -single-site statistics. Site statistics allow for multi-allelic data, while -branch statistics assume an infinite sites model. Within this framework, we -also implement polarisation, but do not expose it to the user, opting to -provide statistics that are polarised where appropriate. +single-site statistics. Within this framework, we also implement polarisation, +but do not expose it to the user, opting to provide statistics that are +polarised where appropriate. (sec_stats_two_locus_site)= @@ -762,7 +761,7 @@ each tree. Similar to the single-site statistics computed in `branch` mode, this results in a statistic that is proportional to the expected statistic under an infinite -sites model (which mutation rate 1), conditioned on the pair of trees. +sites model (with mutation rate 1), conditioned on the pair of trees. The time complexity of this method is quadratic, due to the pairwise comparisons of branches from a pair of trees. By default, this method computes @@ -931,13 +930,6 @@ Where {math}`D` is defined above. And `D2_unbiased`, which can be found in [Ragsdale and Gravel (2020)](https://doi.org/10.1093/molbev/msz265). -:::{todo} -Document statistics that use information about correlation between sites, such as -LdCalculator (and perhaps reference {ref}`sec_identity`). Note that if we have a general -framework which has the same calling conventions as the single site stats, -we can rework the sections above. -::: - (sec_stats_notes)= diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 8ce99280c1..fddd7b5e37 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -10956,7 +10956,7 @@ def ld_matrix( Similarly, in the branch mode, the ``positions`` argument specifies loci for which the expectation for the two-locus statistic is computed - over pairs of trees at those positions. LD statis are computed between + over pairs of trees at those positions. LD stats are computed between trees whose ``[start, end)`` contains the given position (such that repeats of trees are possible). Similar to the site mode, a nested list of row and column positions can be specified separately (resulting in a From bae40654f21af059992b382931b34267f2fb6b0f Mon Sep 17 00:00:00 2001 From: Aaron Ragsdale Date: Tue, 10 Mar 2026 11:42:08 -0500 Subject: [PATCH 5/5] Add changelog change --- python/CHANGELOG.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 916412cc81..bc9b6217da 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -7,6 +7,12 @@ In development - Add ``json+struct`` metadata codec that allows storing binary data using a struct schema alongside JSON metadata. (:user:`benjeffery`, :pr:`3306`) +**Features** + +- Add ``TreeSequence.ld_matrix`` stats method and documentation, for computing + two-locus statistics in site and branch mode. + (:user:`lkirk`, :user:`apragsdale`, :pr:`3416`) + -------------------- [1.0.2] - 2026-03-06 --------------------