Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 208 additions & 3 deletions vortex-array/src/arrays/struct_/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

use std::fmt::Debug;
use std::iter::once;
use std::ops::Not;
use std::sync::Arc;

use vortex_dtype::DType;
Expand All @@ -18,10 +19,23 @@ use vortex_error::vortex_err;
use crate::Array;
use crate::ArrayRef;
use crate::IntoArray;
use crate::builtins::ArrayBuiltins;
use crate::compute::mask;
use crate::stats::ArrayStats;
use crate::validity::Validity;
use crate::vtable::ValidityHelper;

/// Metadata for StructArray serialization.
#[derive(Clone, prost::Message)]
pub struct StructMetadata {
/// If true, child validity is a superset of struct validity (validity was pushed down).
/// For nullable children, their validity already includes struct nulls. For non-nullable
/// children, we apply struct validity on field read. If false (default), no guarantee
/// about relationship - must intersect validities on read.
#[prost(bool, tag = "1")]
pub(super) validity_pushed_down: bool,
}

/// A struct array that stores multiple named fields as columns, similar to a database row.
///
/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
Expand Down Expand Up @@ -147,6 +161,9 @@ pub struct StructArray {
pub(super) fields: Arc<[ArrayRef]>,
pub(super) validity: Validity,
pub(super) stats_set: ArrayStats,
/// true = child validity is a superset of struct validity (validity was pushed down)
/// false = default, no guarantee about relationship
pub(super) validity_pushed_down: bool,
}

pub struct StructArrayParts {
Expand All @@ -157,12 +174,74 @@ pub struct StructArrayParts {
}

impl StructArray {
/// Return the struct fields without the validity of the struct applied
/// Note this field may not have the validity of the parent struct applied.
/// Should use `masked_fields` instead, unless you know what you are doing.
pub fn unmasked_fields(&self) -> &Arc<[ArrayRef]> {
&self.fields
}

/// Return the struct field without the validity of the struct applied
pub fn masked_fields(&self) -> VortexResult<Vec<ArrayRef>> {
if !self.dtype.is_nullable() {
// fields need not be masked
return Ok(self.fields.to_vec());
}

if self.has_validity_pushed_down() {
self.fields
.iter()
.cloned()
.map(|f| {
if f.dtype().is_nullable() {
Ok(f.into_array())
} else {
let validity = self.validity().to_array(self.len);
f.mask(validity)
}
})
.collect::<VortexResult<Vec<_>>>()
} else {
// Apply struct validity to all fields
let struct_validity = self.validity().to_array(self.len);
self.fields
.iter()
.map(move |f| f.clone().mask(struct_validity.clone()))
.collect::<VortexResult<Vec<_>>>()
}
}

pub fn field_by_name(&self, name: impl AsRef<str>) -> VortexResult<ArrayRef> {
let name = name.as_ref();
self.field_by_name_opt(name)?.ok_or_else(|| {
vortex_err!(
"Field {name} not found in struct array with names {:?}",
self.names()
)
})
}

pub fn field_by_name_opt(&self, name: impl AsRef<str>) -> VortexResult<Option<ArrayRef>> {
let name = name.as_ref();
self.struct_fields()
.find(name)
.map(|idx| {
let field = self.fields[idx].clone();
// Non-nullable struct: return field as-is (no struct validity to apply)
if !self.dtype.is_nullable() {
return Ok(field);
}
// Non-nullable field: always apply struct validity (even with validity_pushed_down,
// since we can't push validity to non-nullable fields)
if !field.dtype().is_nullable() {
return field.mask(self.validity().to_array(self.len));
}
// Nullable field: return as-is (validity is either in the field or was pushed down)
Ok(field)
})
.transpose()
}

/// Note this field may not have the validity of the parent struct applied.
/// Should use `field_by_name` instead.
pub fn unmasked_field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
let name = name.as_ref();
self.unmasked_field_by_name_opt(name).ok_or_else(|| {
Expand All @@ -173,7 +252,8 @@ impl StructArray {
})
}

/// Return the struct field without the validity of the struct applied
/// Note this field may not have the validity of the parent struct applied.
/// Should use `field_by_name_opt` instead.
pub fn unmasked_field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
let name = name.as_ref();
self.struct_fields().find(name).map(|idx| &self.fields[idx])
Expand Down Expand Up @@ -286,6 +366,7 @@ impl StructArray {
fields,
validity,
stats_set: Default::default(),
validity_pushed_down: false,
}
}

Expand Down Expand Up @@ -473,4 +554,128 @@ impl StructArray {

Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone())
}

/// Returns whether validity has been pushed down into children.
///
/// When true, child validity is a superset of struct validity (children include
/// the struct's nulls baked in). This is an optimization that allows readers to
/// skip combining struct+child validity when extracting fields.
pub fn has_validity_pushed_down(&self) -> bool {
#[cfg(debug_assertions)]
if self.validity_pushed_down {
self.check_validity_pushed_down()
.vortex_expect("validity_pushed_down invariant violated");
}
self.validity_pushed_down
}

/// Checks that the validity_pushed_down invariant holds.
///
/// When `validity_pushed_down` is true, for every nullable child field,
/// the child's validity must be a superset of the struct's validity.
/// That is, wherever the struct is invalid (null), the child must also be invalid.
fn check_validity_pushed_down(&self) -> VortexResult<()> {
if !self.validity_pushed_down {
return Ok(());
}

let struct_validity = self.validity_mask()?;

for (idx, field) in self.fields.iter().enumerate() {
// Only check nullable children - non-nullable children cannot have validity pushed down
if !field.dtype().is_nullable() {
continue;
}

let child_validity = field.validity_mask()?;

// Check invariant: struct_invalid => child_invalid
// Equivalently: (!struct_validity) & child_validity should be all-false
// If struct is invalid (false) but child is valid (true), that's a violation
let violation = &(!&struct_validity) & &child_validity;
if !violation.all_false() {
vortex_bail!(
"validity_pushed_down invariant violated for field {}: \
struct has nulls at positions where child is valid",
idx
);
}
}

Ok(())
}

/// Set the validity_pushed_down flag.
///
/// For non-nullable structs, this is a no-op (flag stays false) since there's no validity to
/// push down
///
/// For nullable structs, setting this to true indicates that child validity
/// is a superset of struct validity (children include struct's nulls).
pub fn with_validity_pushed_down(mut self, validity_pushed_down: bool) -> Self {
// For non-nullable structs, the flag is meaningless - keep it false
if !self.dtype.is_nullable() {
return self;
}
self.validity_pushed_down = validity_pushed_down;

#[cfg(debug_assertions)]
if validity_pushed_down {
self.check_validity_pushed_down()
.vortex_expect("validity_pushed_down invariant violated");
}

self
}

/// Push struct validity down into each child field.
///
/// For nullable structs with non-trivial validity, this applies the validity
/// mask to each **nullable** child field, making child validity a superset
/// of parent validity. Non-nullable children are left unchanged to preserve
/// their dtype.
///
/// The struct validity is **preserved** (DType never changes). The
/// `validity_pushed_down` flag indicates that nullable children already include
/// the parent's nulls, so readers can skip combining validities for those fields.
///
/// For non-nullable structs or trivial validity, this is essentially a no-op.
pub fn compact(&self) -> VortexResult<Self> {
// For non-nullable structs, nothing to push down
if !self.dtype.is_nullable() {
return Ok(self.clone());
}

// If validity is trivial (AllValid), nothing to push down
// but mark as pushed down since children trivially include parent validity
if self.validity.all_valid(self.len)? {
return Ok(self.clone().with_validity_pushed_down(true));
}

// Get the validity mask - mask() expects true = set to null, so we invert the validity
let validity_mask = self.validity_mask()?.not();

// Apply mask only to nullable children - non-nullable children cannot have their
// dtype changed, so we leave them alone
let new_fields: Vec<ArrayRef> = self
.unmasked_fields()
.iter()
.map(|field| {
if field.dtype().is_nullable() {
mask(field.as_ref(), &validity_mask)
} else {
Ok(field.clone())
}
})
.collect::<VortexResult<_>>()?;

// Create new struct with same validity but updated children
Ok(StructArray::try_new(
self.names().clone(),
new_fields,
self.len(),
self.validity.clone(), // Keep original validity
)?
.with_validity_pushed_down(true))
}
}
1 change: 1 addition & 0 deletions vortex-array/src/arrays/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
mod array;
pub use array::StructArray;
pub use array::StructArrayParts;
pub use array::StructMetadata;
mod compute;

mod vtable;
Expand Down
Loading
Loading