Skip to content
Open
13 changes: 10 additions & 3 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,14 @@ impl Statistics {
col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
col_stats.distinct_count = Precision::Absent;
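// Use the larger of the two distinct counts as a conservative lower bound:
// the merged NDV is at least as large as either input, but it can't be
// computed exactly because the same values may appear in several partitions.
// If only one side has a value, that value is used; the result is Absent
// only when both sides are Absent.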
col_stats.distinct_count = col_stats
.distinct_count
.get_value()
.max(item_col_stats.distinct_count.get_value())
.map(|&v| Precision::Inexact(v))
.unwrap_or(Precision::Absent);
col_stats.byte_size = col_stats.byte_size.add(&item_col_stats.byte_size);
}

Expand Down Expand Up @@ -1352,8 +1359,8 @@ mod tests {
col_stats.max_value,
Precision::Exact(ScalarValue::Int32(Some(20)))
);
// Distinct count should be Absent after merge
assert_eq!(col_stats.distinct_count, Precision::Absent);
// Distinct count should be Inexact(max) after merge as a conservative lower bound
assert_eq!(col_stats.distinct_count, Precision::Inexact(7));
}

#[test]
Expand Down
12 changes: 7 additions & 5 deletions datafusion/core/tests/physical_optimizer/partition_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,15 @@ mod test {
// - null_count = 0 (partition values from paths are never null)
// - min/max are the merged partition values across files in the group
// - byte_size = num_rows * 4 (Date32 is 4 bytes per row)
// - distinct_count = Inexact(1) per partition file (single partition value per file);
// merging stats across partitions takes the max of the per-file counts, so it stays Inexact(1)
let date32_byte_size = num_rows * 4;
column_stats.push(ColumnStatistics {
null_count: Precision::Exact(0),
max_value: Precision::Exact(ScalarValue::Date32(Some(max_date))),
min_value: Precision::Exact(ScalarValue::Date32(Some(min_date))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Exact(date32_byte_size),
});
}
Expand Down Expand Up @@ -577,7 +579,7 @@ mod test {
max_value: Precision::Exact(ScalarValue::Date32(Some(20151))),
min_value: Precision::Exact(ScalarValue::Date32(Some(20148))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Absent,
},
// column 2: right.id (Int32, file column from t2) - right partition 0: ids [3,4]
Expand Down Expand Up @@ -611,7 +613,7 @@ mod test {
max_value: Precision::Exact(ScalarValue::Date32(Some(20151))),
min_value: Precision::Exact(ScalarValue::Date32(Some(20148))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Absent,
},
// column 2: right.id (Int32, file column from t2) - right partition 1: ids [1,2]
Expand Down Expand Up @@ -1247,7 +1249,7 @@ mod test {
DATE_2025_03_01,
))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Exact(8),
},
ColumnStatistics::new_unknown(), // window column
Expand Down Expand Up @@ -1275,7 +1277,7 @@ mod test {
DATE_2025_03_03,
))),
sum_value: Precision::Absent,
distinct_count: Precision::Absent,
distinct_count: Precision::Inexact(1),
byte_size: Precision::Exact(8),
},
ColumnStatistics::new_unknown(), // window column
Expand Down
Loading