From a1eeb1f9fb66c356b08e166c0b8a02564c1d602f Mon Sep 17 00:00:00 2001 From: Brian Geuther Date: Tue, 21 Oct 2025 11:50:44 -0400 Subject: [PATCH 1/9] Extending new metrics to summaries table too --- pyproject.toml | 2 +- src/jabs_postprocess/utils/project_utils.py | 17 ++++++++++++++++- uv.lock | 2 +- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4f1eee3..0ddc0be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "jabs-postprocess" -version = "0.4.2" +version = "0.5.0" description = "A python library for JABS postprocessing utilities." readme = "README.md" license = "LicenseRef-PLATFORM-LICENSE-AGREEMENT-FOR-NON-COMMERCIAL-USE" diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index 4823460..4b1ddef 100644 --- a/src/jabs_postprocess/utils/project_utils.py +++ b/src/jabs_postprocess/utils/project_utils.py @@ -802,7 +802,6 @@ def add_bout_statistics(self): - bout_duration_var: Variance of bout durations for this animal - latency_to_first_bout: Frame number of first behavior bout (if any) """ - # Group by animal and calculate statistics for behavior bouts only behavior_bouts = self._data[self._data["is_behavior"] == 1] @@ -1019,6 +1018,18 @@ def bouts_to_bins( results["bout_behavior"] = len( bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1] ) + results["avg_bout_duration"] = bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "duration" + ].mean() + results["bout_duration_std"] = bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "duration" + ].std() + results["bout_duration_var"] = bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "duration" + ].var() + results["latency_to_first_bout"] = bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "start" + ].head(1) if "distance" in bins_to_summarize.keys(): results["not_behavior_dist"] = bins_to_summarize.loc[ 
bins_to_summarize["is_behavior"] == 0, "calc_dist" @@ -1107,6 +1118,10 @@ def __init__(self, settings: ClassifierSettings, data: pd.DataFrame): "time", "not_behavior_dist", "behavior_dist", + "avg_bout_duration", + "bout_duration_std", + "bout_duration_var", + "latency_to_first_bout", ] self._check_fields() diff --git a/uv.lock b/uv.lock index 6837038..3d517ae 100644 --- a/uv.lock +++ b/uv.lock @@ -340,7 +340,7 @@ wheels = [ [[package]] name = "jabs-postprocess" -version = "0.4.2" +version = "0.5.0" source = { editable = "." } dependencies = [ { name = "black" }, From a4842363adba6dc7a5a7fd388d07da8b3a38cb0b Mon Sep 17 00:00:00 2001 From: Brian Geuther Date: Tue, 21 Oct 2025 13:32:43 -0400 Subject: [PATCH 2/9] Adjusting README to describe new fields --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f5d08d7..acad812 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Lots of the functions used in generating these behavior tables were designed for There are two behavior tables generated. Both contain a header line to store parameters used while calling the script. -Some features are optional, because calculating them can be expensive. These options are noted with an asterisk (\*). While default behavior is to include them, they are not guaranteed. +Some features are optional, because calculating them can be expensive or are controlled via optional arguments. These options are noted with an asterisk (\*). While default behavior is to include them, they are not guaranteed. 
## Header Data @@ -150,6 +150,11 @@ The bout table contains a compressed RLE encoded format for each bout (post-filt * `0` : Not behavior prediction * `1` : Behavior prediction * `distance`\* : Distance traveled during bout +* `total_bout_count`\* : Number of behavior bouts per animal +* `avg_bout_duration`\* : Average duration of all bouts per animal +* `bout_duration_std`\* : Standard deviation of all bout durations +* `bout_duration_var`\* : Variance of all bout durations +* `latency_to_first_bout`\* : Frame number of first behavior bout ## Binned Table @@ -168,6 +173,11 @@ Summaries included: * If a bout spans multiple time bins, it will be divided into both via the proportion of time * Sum of bouts across bins produces the correct total count * Note that bouts cannot span between video files +* `avg_bout_duration` : Average bout duration per animal (in time bin) +* `bout_duration_std` : Standard deviation of bout durations (in time bin) +* `bout_duration_var` : Variance of bout durations (in time bin) +* `latency_to_first_bout` : Frame number of first behavior bout in the time bin + * Frame is relative to the experiment start, not the time bin * `not_behavior_dist`\* : Total distance traveled during not behavior bouts * `behavior_dist`\* : Total distance traveled during behavior bouts From ca61c2c3e835cae1562d2f1bf57e0b1896b34bae Mon Sep 17 00:00:00 2001 From: Brian Geuther Date: Tue, 21 Oct 2025 13:33:07 -0400 Subject: [PATCH 3/9] Fixing the usage of bout splitting of metrics into summary table data --- src/jabs_postprocess/utils/project_utils.py | 40 ++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index 4b1ddef..a48adc1 100644 --- a/src/jabs_postprocess/utils/project_utils.py +++ b/src/jabs_postprocess/utils/project_utils.py @@ -901,7 +901,9 @@ def bouts_to_bins( Binned event data describing the event data. 
Notes: - Binned data describes event data as summaries. For each state, total time and distance travelled are provided. Additionally, the number of behavior events are counted. + Binned data describes event data as summaries. + For each state, total time and distance travelled are provided. + Additionally, the number of behavior events are counted. Events that span multiple bins are split between them based on the percent in each, allowing fractional bout counts. """ # Get the range that the experiment spans @@ -1015,18 +1017,32 @@ def bouts_to_bins( results["time_behavior"] = bins_to_summarize.loc[ bins_to_summarize["is_behavior"] == 1, "duration" ].sum() - results["bout_behavior"] = len( - bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1] + results["bout_behavior"] = ( + bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "percent_bout" + ] + ).sum() + results["avg_bout_duration"] = ( + results["time_behavior"] / results["bout_behavior"] ) - results["avg_bout_duration"] = bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "duration" - ].mean() - results["bout_duration_std"] = bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "duration" - ].std() - results["bout_duration_var"] = bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "duration" - ].var() + if results["bout_behavior"] > 1: + results["bout_duration_var"] = np.sum( + np.square( + ( + bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "duration" + ] + / bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "percent_bout" + ] + ) + - results["avg_bout_duration"] + ) + ) / (results["bout_behavior"] - 1) + results["bout_duration_std"] = np.sqrt(results["bout_duration_var"]) + else: + results["bout_duration_var"] = np.nan + results["bout_duration_std"] = np.nan results["latency_to_first_bout"] = bins_to_summarize.loc[ bins_to_summarize["is_behavior"] == 1, "start" ].head(1) From 5529dc170a7efe4f4e9004e8680fdfcdcf053b74 Mon 
Sep 17 00:00:00 2001 From: Brian Geuther Date: Tue, 21 Oct 2025 14:42:25 -0400 Subject: [PATCH 4/9] Adds latency to last prediction. Changes first "bout" to "prediction" for clarity --- README.md | 3 ++- src/jabs_postprocess/utils/project_utils.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index acad812..011c278 100644 --- a/README.md +++ b/README.md @@ -176,8 +176,9 @@ Summaries included: * `avg_bout_duration` : Average bout duration per animal (in time bin) * `bout_duration_std` : Standard deviation of bout durations (in time bin) * `bout_duration_var` : Variance of bout durations (in time bin) -* `latency_to_first_bout` : Frame number of first behavior bout in the time bin +* `latency_to_first_prediction` : Frame number of first behavior prediction in the time bin * Frame is relative to the experiment start, not the time bin +* `latency_to_last_prediction` : Frame number of last behavior prediction in the time bin * `not_behavior_dist`\* : Total distance traveled during not behavior bouts * `behavior_dist`\* : Total distance traveled during behavior bouts diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index a48adc1..36157d4 100644 --- a/src/jabs_postprocess/utils/project_utils.py +++ b/src/jabs_postprocess/utils/project_utils.py @@ -1043,9 +1043,21 @@ def bouts_to_bins( else: results["bout_duration_var"] = np.nan results["bout_duration_std"] = np.nan - results["latency_to_first_bout"] = bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "start" - ].head(1) + results["latency_to_first_prediction"] = ( + bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1, "start"] + .head(1) + .values + ) + results["latency_to_last_prediction"] = ( + bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1, "start"] + .tail(1) + .values + + bins_to_summarize.loc[ + bins_to_summarize["is_behavior"] == 1, "duration" + ] + .tail(1) + 
.values + ) if "distance" in bins_to_summarize.keys(): results["not_behavior_dist"] = bins_to_summarize.loc[ bins_to_summarize["is_behavior"] == 0, "calc_dist" @@ -1137,7 +1149,8 @@ def __init__(self, settings: ClassifierSettings, data: pd.DataFrame): "avg_bout_duration", "bout_duration_std", "bout_duration_var", - "latency_to_first_bout", + "latency_to_first_prediction", + "latency_to_last_prediction", ] self._check_fields() From 69ebed87e7a05aac0e383596a4e28c6757f91cd2 Mon Sep 17 00:00:00 2001 From: Brian Geuther Date: Tue, 21 Oct 2025 15:02:19 -0400 Subject: [PATCH 5/9] Switching to a weighted mean/var/std calculation, simplifying some of the indexing reading --- src/jabs_postprocess/utils/project_utils.py | 61 ++++++++++----------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index 36157d4..a5aa71f 100644 --- a/src/jabs_postprocess/utils/project_utils.py +++ b/src/jabs_postprocess/utils/project_utils.py @@ -1014,49 +1014,48 @@ def bouts_to_bins( results["time_not_behavior"] = bins_to_summarize.loc[ bins_to_summarize["is_behavior"] == 0, "duration" ].sum() - results["time_behavior"] = bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "duration" - ].sum() - results["bout_behavior"] = ( - bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "percent_bout" - ] - ).sum() + + # Lots of "behavior" stats are run, so separate them for convenience + behavior_bins = bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1] + + results["time_behavior"] = behavior_bins["duration"].sum() + results["bout_behavior"] = behavior_bins["percent_bout"].sum() + # We use a weighted statistic definitions here + # Weights are the proportion of bout contained in the bin (percent_bout) results["avg_bout_duration"] = ( - results["time_behavior"] / results["bout_behavior"] + np.sum( + behavior_bins["duration"].values + * 
behavior_bins["percent_bout"].values + ) + / results["bout_behavior"] ) if results["bout_behavior"] > 1: - results["bout_duration_var"] = np.sum( - np.square( - ( - bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "duration" - ] - / bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "percent_bout" - ] + denom = ( + (len(behavior_bins) - 1) + * results["bout_behavior"] + / len(behavior_bins) + ) + results["bout_duration_var"] = ( + np.sum( + behavior_bins["percent_bout"].values + * np.square( + behavior_bins["duration"].values + / behavior_bins["percent_bout"].values + - results["avg_bout_duration"] ) - - results["avg_bout_duration"] ) - ) / (results["bout_behavior"] - 1) + / denom + ) results["bout_duration_std"] = np.sqrt(results["bout_duration_var"]) else: results["bout_duration_var"] = np.nan results["bout_duration_std"] = np.nan results["latency_to_first_prediction"] = ( - bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1, "start"] - .head(1) - .values + behavior_bins["start"].head(1).values ) results["latency_to_last_prediction"] = ( - bins_to_summarize.loc[bins_to_summarize["is_behavior"] == 1, "start"] - .tail(1) - .values - + bins_to_summarize.loc[ - bins_to_summarize["is_behavior"] == 1, "duration" - ] - .tail(1) - .values + behavior_bins["start"].tail(1).values + + behavior_bins["duration"].tail(1).values ) if "distance" in bins_to_summarize.keys(): results["not_behavior_dist"] = bins_to_summarize.loc[ From f432981ba26201f89945b03b1c9760cae1affdd0 Mon Sep 17 00:00:00 2001 From: Alexander Berger Date: Tue, 21 Oct 2025 16:18:33 -0400 Subject: [PATCH 6/9] Adding conditional for bout_behavior > 0 --- src/jabs_postprocess/utils/project_utils.py | 65 ++++++++++++--------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index a5aa71f..199f34b 100644 --- a/src/jabs_postprocess/utils/project_utils.py +++ 
b/src/jabs_postprocess/utils/project_utils.py @@ -1022,41 +1022,48 @@ def bouts_to_bins( results["bout_behavior"] = behavior_bins["percent_bout"].sum() # We use a weighted statistic definitions here # Weights are the proportion of bout contained in the bin (percent_bout) - results["avg_bout_duration"] = ( - np.sum( - behavior_bins["duration"].values - * behavior_bins["percent_bout"].values - ) - / results["bout_behavior"] - ) - if results["bout_behavior"] > 1: - denom = ( - (len(behavior_bins) - 1) - * results["bout_behavior"] - / len(behavior_bins) - ) - results["bout_duration_var"] = ( + if results["bout_behavior"] > 0: + results["avg_bout_duration"] = ( np.sum( - behavior_bins["percent_bout"].values - * np.square( - behavior_bins["duration"].values - / behavior_bins["percent_bout"].values - - results["avg_bout_duration"] - ) + behavior_bins["duration"].values + * behavior_bins["percent_bout"].values ) - / denom + / results["bout_behavior"] ) - results["bout_duration_std"] = np.sqrt(results["bout_duration_var"]) + results["latency_to_first_prediction"] = behavior_bins["start"].min() + results["latency_to_last_prediction"] = ( + behavior_bins["start"] + behavior_bins["duration"] + ).max() + + # Variance requires more than one effective bout + if results["bout_behavior"] > 1: + denom = ( + (len(behavior_bins) - 1) + * results["bout_behavior"] + / len(behavior_bins) + ) + results["bout_duration_var"] = ( + np.sum( + behavior_bins["percent_bout"].values + * np.square( + behavior_bins["duration"].values + / behavior_bins["percent_bout"].values + - results["avg_bout_duration"] + ) + ) + / denom + ) + results["bout_duration_std"] = np.sqrt(results["bout_duration_var"]) + else: + results["bout_duration_var"] = np.nan + results["bout_duration_std"] = np.nan else: + # No behavior data - set all defaults + results["avg_bout_duration"] = np.nan results["bout_duration_var"] = np.nan results["bout_duration_std"] = np.nan - results["latency_to_first_prediction"] = ( - 
behavior_bins["start"].head(1).values - ) - results["latency_to_last_prediction"] = ( - behavior_bins["start"].tail(1).values - + behavior_bins["duration"].tail(1).values - ) + results["latency_to_first_prediction"] = np.nan + results["latency_to_last_prediction"] = np.nan if "distance" in bins_to_summarize.keys(): results["not_behavior_dist"] = bins_to_summarize.loc[ bins_to_summarize["is_behavior"] == 0, "calc_dist" From 23b7a747bdc356cfa838e3ed3d59f7594ea153b5 Mon Sep 17 00:00:00 2001 From: Alexander Berger Date: Tue, 21 Oct 2025 16:43:59 -0400 Subject: [PATCH 7/9] Add tests additional statistics on summary tables --- tests/utils/test_project_utils.py | 295 +++++++++++++++++++++++++++++- 1 file changed, 294 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_project_utils.py b/tests/utils/test_project_utils.py index 7e8a3bf..70ecfd9 100644 --- a/tests/utils/test_project_utils.py +++ b/tests/utils/test_project_utils.py @@ -444,7 +444,7 @@ def test_add_bout_statistics_preserves_existing_columns(self): def test_add_bout_statistics_variance_calculation(self): """Test that variance and standard deviation are calculated correctly.""" - # Arrange - Create data with known variance + # Arrange: Create data with known variance data = pd.DataFrame( { "animal_idx": [0, 0, 0], @@ -515,3 +515,296 @@ def test_add_bout_statistics_empty_dataframe(self): ) # No rows to check values, but shouldn't crash + + +class TestBoutTableBoutsToBinsWeightedStats: + """Test suite for BoutTable.bouts_to_bins method focusing on weighted statistics.""" + + def test_bouts_to_bins_weighted_bout_count_with_split_bouts(self): + """Test that bout_behavior correctly uses weighted percent_bout sum. + + When bouts span multiple bins, they should be counted fractionally + based on how much of each bout falls within each bin. 
+ """ + # Arrange: Create a simple scenario with one bout split across bins + # Bin 1: 0-1800 frames (1 minute at 30fps) + # Bin 2: 1800-3600 frames + # Bout: starts at frame 1500, duration 600 (spans both bins) + data = pd.DataFrame( + { + "animal_idx": [0], + "video_name": ["vid1"], + "start": [1500], + "duration": [600], # 300 frames in bin 1, 300 frames in bin 2 + "is_behavior": [1], + "exp_prefix": ["exp1"], + "time": ["1970-01-01 00:00:00"], + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # The first bin should have 300/600 = 0.5 of the bout + # The second bin should have 300/600 = 0.5 of the bout + assert len(result) == 2, "Should have 2 bins" + assert abs(result.iloc[0]["bout_behavior"] - 0.5) < 0.01, ( + "First bin should count 0.5 bouts" + ) + assert abs(result.iloc[1]["bout_behavior"] - 0.5) < 0.01, ( + "Second bin should count 0.5 bouts" + ) + + def test_bouts_to_bins_weighted_avg_bout_duration(self): + """Test that avg_bout_duration is calculated using weighted statistics. + + The weighted mean should account for partial bouts in each bin. 
+ """ + # Arrange: Create scenario with multiple bouts of different sizes + # Bin: 0-1800 frames + # Bout 1: 100 frames at start 100 + # Bout 2: 200 frames at start 500 + data = pd.DataFrame( + { + "animal_idx": [0, 0], + "video_name": ["vid1", "vid1"], + "start": [100, 500], + "duration": [100, 200], + "is_behavior": [1, 1], + "exp_prefix": ["exp1", "exp1"], + "time": ["1970-01-01 00:00:00", "1970-01-01 00:00:00"], + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # Both bouts are fully in the first bin (percent_bout = 1.0 for each) + # Weighted mean = (100*1.0 + 200*1.0) / (1.0 + 1.0) = 300/2 = 150 + assert len(result) == 1, "Should have 1 bin" + expected_avg = 150.0 + assert abs(result.iloc[0]["avg_bout_duration"] - expected_avg) < 0.1, ( + f"Expected avg_bout_duration to be {expected_avg}" + ) + + def test_bouts_to_bins_weighted_variance_and_std(self): + """Test that variance and std are calculated correctly with weighting.""" + # Arrange: Create data with known variance + # Three bouts fully in one bin: durations 10, 20, 30 + data = pd.DataFrame( + { + "animal_idx": [0, 0, 0], + "video_name": ["vid1"] * 3, + "start": [100, 500, 900], + "duration": [10, 20, 30], + "is_behavior": [1, 1, 1], + "exp_prefix": ["exp1"] * 3, + "time": ["1970-01-01 00:00:00"] * 3, + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # All bouts fully in bin, percent_bout = 1.0 for all + # Weighted mean = (10*1 + 20*1 + 30*1) / 3 = 20 + # Weighted variance formula used in code: + # sum(percent_bout * (duration/percent_bout - mean)^2) / denom + # where denom = (n-1) * sum(percent_bout) / n = 2 * 3 / 3 = 2 + # = (1*(10-20)^2 + 1*(20-20)^2 + 1*(30-20)^2) / 2 + # = (100 + 0 + 100) / 2 = 100 + expected_var = 100.0 + expected_std = 10.0 + + assert abs(result.iloc[0]["avg_bout_duration"] - 20.0) < 0.1 + assert abs(result.iloc[0]["bout_duration_var"] - expected_var) < 0.1, ( + f"Expected 
variance to be {expected_var}" + ) + assert abs(result.iloc[0]["bout_duration_std"] - expected_std) < 0.1, ( + f"Expected std to be {expected_std}" + ) + + def test_bouts_to_bins_variance_with_single_bout(self): + """Test that variance and std are NaN when only one bout exists.""" + # Arrange: Single bout + data = pd.DataFrame( + { + "animal_idx": [0], + "video_name": ["vid1"], + "start": [100], + "duration": [100], + "is_behavior": [1], + "exp_prefix": ["exp1"], + "time": ["1970-01-01 00:00:00"], + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # With only 1 bout, bout_behavior = 1, which triggers the else branch + assert result.iloc[0]["bout_behavior"] == 1.0 + assert pd.isna(result.iloc[0]["bout_duration_var"]), ( + "Variance should be NaN for single bout" + ) + assert pd.isna(result.iloc[0]["bout_duration_std"]), ( + "Std should be NaN for single bout" + ) + + def test_bouts_to_bins_latency_to_first_prediction(self): + """Test that latency_to_first_prediction captures the start of the first bout.""" + # Arrange: Multiple bouts in one bin + data = pd.DataFrame( + { + "animal_idx": [0, 0, 0], + "video_name": ["vid1"] * 3, + "start": [500, 200, 800], # Unordered on purpose + "duration": [50, 30, 40], + "is_behavior": [1, 1, 1], + "exp_prefix": ["exp1"] * 3, + "time": ["1970-01-01 00:00:00"] * 3, + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # First prediction should be at frame 200 (earliest start) + assert result.iloc[0]["latency_to_first_prediction"] == 200, ( + "Expected first prediction at frame 200" + ) + + def test_bouts_to_bins_latency_to_last_prediction(self): + """Test that latency_to_last_prediction captures the end of the last bout.""" + # Arrange: Multiple bouts in one bin + data = pd.DataFrame( + { + "animal_idx": [0, 0, 0], + "video_name": ["vid1"] * 3, + "start": [200, 500, 800], + "duration": [30, 50, 40], # Last bout ends at 800+40=840 + 
"is_behavior": [1, 1, 1], + "exp_prefix": ["exp1"] * 3, + "time": ["1970-01-01 00:00:00"] * 3, + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # Last prediction should end at frame 840 (800 + 40) + assert result.iloc[0]["latency_to_last_prediction"] == 840, ( + "Expected last prediction to end at frame 840" + ) + + def test_bouts_to_bins_no_behavior_bouts(self): + """Test handling when there are no behavior bouts in a bin.""" + # Arrange: Only non-behavior events + data = pd.DataFrame( + { + "animal_idx": [0, 0], + "video_name": ["vid1"] * 2, + "start": [100, 500], + "duration": [200, 300], + "is_behavior": [0, -1], # No behavior + "exp_prefix": ["exp1"] * 2, + "time": ["1970-01-01 00:00:00"] * 2, + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + assert result.iloc[0]["bout_behavior"] == 0 + assert result.iloc[0]["time_behavior"] == 0 + # avg_bout_duration with 0 bouts: division by zero should give inf or nan + # The actual behavior depends on numpy settings, but we expect either + assert pd.isna(result.iloc[0]["avg_bout_duration"]) or np.isinf( + result.iloc[0]["avg_bout_duration"] + ) + + def test_bouts_to_bins_split_bout_weighted_statistics(self): + """Test weighted statistics when a single bout is split across bins. + + This is a critical test for the weighted statistics - a bout split + 50/50 across two bins should contribute 0.5 to each bin's statistics. 
+ """ + # Arrange: One long bout split exactly in half across 2 bins + # Bin 1: frames 0-1800 (1 min at 30fps) + # Bin 2: frames 1800-3600 + # Bout: frames 1500-2100 (duration 600, split 300/300) + data = pd.DataFrame( + { + "animal_idx": [0], + "video_name": ["vid1"], + "start": [1500], + "duration": [600], + "is_behavior": [1], + "exp_prefix": ["exp1"], + "time": ["1970-01-01 00:00:00"], + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # Both bins should have: + # - bout_behavior = 0.5 (half the bout counted) + # - time_behavior = 300 (half the frames) + # - avg_bout_duration = 300 (the actual duration in each bin) + # - variance = NaN (only one bout, even if split) + assert len(result) == 2 + + for i in range(2): + assert abs(result.iloc[i]["bout_behavior"] - 0.5) < 0.01 + assert result.iloc[i]["time_behavior"] == 300 + # When a bout is split, the avg_bout_duration is the fractional duration + assert abs(result.iloc[i]["avg_bout_duration"] - 300) < 0.1 + assert pd.isna(result.iloc[i]["bout_duration_var"]) + + def test_bouts_to_bins_multiple_split_bouts_variance(self): + """Test variance calculation with multiple bouts that span bins.""" + # Arrange: Two bouts with different durations, both split across bins + # Bin boundary at 1800 frames + # Bout 1: 1600-2000 (400 frames, 200 in each bin) + # Bout 2: 1700-2100 (400 frames, 100 in bin 1, 300 in bin 2) + data = pd.DataFrame( + { + "animal_idx": [0, 0], + "video_name": ["vid1", "vid1"], + "start": [1600, 1700], + "duration": [400, 400], + "is_behavior": [1, 1], + "exp_prefix": ["exp1", "exp1"], + "time": ["1970-01-01 00:00:00", "1970-01-01 00:00:00"], + } + ) + + # Act + result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) + + # Assert + # Bin 1: 0.5 + 0.25 = 0.75 bouts (< 1, so variance should be NaN) + # Bin 2: 0.5 + 0.75 = 1.25 bouts (> 1, so variance should be calculated) + assert len(result) == 2 + + # Bin 0 (first bin) has bout_behavior = 
0.75 (< 1), so variance is NaN + assert abs(result.iloc[0]["bout_behavior"] - 0.75) < 0.01 + assert pd.isna(result.iloc[0]["bout_duration_var"]) + assert pd.isna(result.iloc[0]["bout_duration_std"]) + + # Bin 1 (second bin) has bout_behavior = 1.25 (> 1), so variance is calculated + assert abs(result.iloc[1]["bout_behavior"] - 1.25) < 0.01 + assert not pd.isna(result.iloc[1]["bout_duration_var"]) + assert not pd.isna(result.iloc[1]["bout_duration_std"]) + assert result.iloc[1]["bout_duration_std"] == np.sqrt( + result.iloc[1]["bout_duration_var"] + ) From a28e0a135062146be6eededfd53652454bf0b38d Mon Sep 17 00:00:00 2001 From: Brian Geuther Date: Wed, 22 Oct 2025 11:21:38 -0400 Subject: [PATCH 8/9] Adjusting variance calculation to nan on sample count, not percent bouts --- src/jabs_postprocess/utils/project_utils.py | 2 +- tests/utils/test_project_utils.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index 199f34b..858725b 100644 --- a/src/jabs_postprocess/utils/project_utils.py +++ b/src/jabs_postprocess/utils/project_utils.py @@ -1036,7 +1036,7 @@ def bouts_to_bins( ).max() # Variance requires more than one effective bout - if results["bout_behavior"] > 1: + if len(behavior_bins) > 1: denom = ( (len(behavior_bins) - 1) * results["bout_behavior"] diff --git a/tests/utils/test_project_utils.py b/tests/utils/test_project_utils.py index 70ecfd9..9009370 100644 --- a/tests/utils/test_project_utils.py +++ b/tests/utils/test_project_utils.py @@ -792,16 +792,19 @@ def test_bouts_to_bins_multiple_split_bouts_variance(self): result = BoutTable.bouts_to_bins(data, bin_size_minutes=1, fps=30) # Assert - # Bin 1: 0.5 + 0.25 = 0.75 bouts (< 1, so variance should be NaN) - # Bin 2: 0.5 + 0.75 = 1.25 bouts (> 1, so variance should be calculated) + # Bin 1: 0.5 and 0.25 = 2 bout samples (so variance should be calculated) + # Bin 2: 0.5 and 0.75 = 2 bout 
samples (so variance should be calculated) assert len(result) == 2 - # Bin 0 (first bin) has bout_behavior = 0.75 (< 1), so variance is NaN + # Bin 0 (first bin) has 2 bout samples, so variance is calculated assert abs(result.iloc[0]["bout_behavior"] - 0.75) < 0.01 - assert pd.isna(result.iloc[0]["bout_duration_var"]) - assert pd.isna(result.iloc[0]["bout_duration_std"]) + assert not pd.isna(result.iloc[0]["bout_duration_var"]) + assert not pd.isna(result.iloc[0]["bout_duration_std"]) + assert result.iloc[1]["bout_duration_std"] == np.sqrt( + result.iloc[1]["bout_duration_var"] + ) - # Bin 1 (second bin) has bout_behavior = 1.25 (> 1), so variance is calculated + # Bin 1 (second bin) has 2 bout samples, so variance is calculated assert abs(result.iloc[1]["bout_behavior"] - 1.25) < 0.01 assert not pd.isna(result.iloc[1]["bout_duration_var"]) assert not pd.isna(result.iloc[1]["bout_duration_std"]) From 61892aa56fac28382ad8cd4950f987b6ef3c29a3 Mon Sep 17 00:00:00 2001 From: Brian Geuther Date: Wed, 22 Oct 2025 12:57:49 -0400 Subject: [PATCH 9/9] Adding sample count to table --- README.md | 1 + src/jabs_postprocess/utils/project_utils.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 011c278..b4fd98c 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,7 @@ Summaries included: * If a bout spans multiple time bins, it will be divided into both via the proportion of time * Sum of bouts across bins produces the correct total count * Note that bouts cannot span between video files +* `_stats_sample_count` : Sample count used in stats calculation (count of whole and partial bouts in time bin) * `avg_bout_duration` : Average bout duration per animal (in time bin) * `bout_duration_std` : Standard deviation of bout durations (in time bin) * `bout_duration_var` : Variance of bout durations (in time bin) diff --git a/src/jabs_postprocess/utils/project_utils.py b/src/jabs_postprocess/utils/project_utils.py index 858725b..e7b77d2 100644 --- 
a/src/jabs_postprocess/utils/project_utils.py +++ b/src/jabs_postprocess/utils/project_utils.py @@ -1020,6 +1020,7 @@ def bouts_to_bins( results["time_behavior"] = behavior_bins["duration"].sum() results["bout_behavior"] = behavior_bins["percent_bout"].sum() + results["_stats_sample_count"] = len(behavior_bins) # We use a weighted statistic definitions here # Weights are the proportion of bout contained in the bin (percent_bout) if results["bout_behavior"] > 0: @@ -1153,6 +1154,7 @@ def __init__(self, settings: ClassifierSettings, data: pd.DataFrame): "not_behavior_dist", "behavior_dist", "avg_bout_duration", + "_stats_sample_count", "bout_duration_std", "bout_duration_var", "latency_to_first_prediction",