diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 45cd7e838121..0d5adf587bba 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1022,6 +1022,7 @@ if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc extension/opaque.cc + extension/range.cc extension/tensor_internal.cc extension/variable_shape_tensor.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index ae52bc32a998..966c927bf2ce 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -18,7 +18,8 @@ set(CANONICAL_EXTENSION_TESTS bool8_test.cc json_test.cc uuid_test.cc) if(ARROW_JSON) - list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc) + list(APPEND CANONICAL_EXTENSION_TESTS tensor_extension_array_test.cc opaque_test.cc + range_test.cc) endif() add_arrow_test(test diff --git a/cpp/src/arrow/extension/meson.build b/cpp/src/arrow/extension/meson.build index 84dafe4bbe32..8be6b1321a1c 100644 --- a/cpp/src/arrow/extension/meson.build +++ b/cpp/src/arrow/extension/meson.build @@ -21,6 +21,7 @@ if needs_json canonical_extension_tests += [ 'tensor_extension_array_test.cc', 'opaque_test.cc', + 'range_test.cc', ] endif @@ -38,6 +39,7 @@ install_headers( 'json.h', 'opaque.h', 'parquet_variant.h', + 'range.h', 'uuid.h', 'variable_shape_tensor.h', ], diff --git a/cpp/src/arrow/extension/range.cc b/cpp/src/arrow/extension/range.cc new file mode 100644 index 000000000000..d39d90acfe96 --- /dev/null +++ b/cpp/src/arrow/extension/range.cc @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/range.h" + +#include +#include + +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/util/logging_internal.h" + +#include +#include +#include + +namespace arrow::extension { + +namespace { + +/// Map RangeClosed -> the JSON string value used in serialization. +std::string_view ClosedToString(RangeClosed closed) { + switch (closed) { + case RangeClosed::Left: + return "left"; + case RangeClosed::Right: + return "right"; + case RangeClosed::Both: + return "both"; + case RangeClosed::Neither: + return "neither"; + } + // unreachable + return "right"; +} + +/// Parse the JSON "closed" string into a RangeClosed enum. +/// Returns an error if the string is not one of the four valid values. +Result ClosedFromString(std::string_view s) { + if (s == "left") return RangeClosed::Left; + if (s == "right") return RangeClosed::Right; + if (s == "both") return RangeClosed::Both; + if (s == "neither") return RangeClosed::Neither; + return Status::Invalid( + "Invalid value for RangeType \"closed\" parameter: \"", s, + "\". Expected one of: \"left\", \"right\", \"both\", \"neither\"."); +} + +/// Build the storage Struct type for a given value subtype. 
+std::shared_ptr MakeStorageType(const std::shared_ptr& value_type, + bool allow_unbounded) { + // Nullable bounds can represent an unbounded (infinite) endpoint; non-nullable + // bounds are always finite. + return struct_({field("lower", value_type, allow_unbounded), + field("upper", value_type, allow_unbounded)}); +} + +} // namespace + +// --------------------------------------------------------------------------- +// RangeType + +std::shared_ptr RangeType::value_type() const { + // storage_type() is a struct with two fields; both share the same type. + return internal::checked_cast(*storage_type()).field(0)->type(); +} + +std::string RangeType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() + << "[value_type=" << value_type()->ToString(show_metadata) + << ", closed=" << ClosedToString(closed_) << "]>"; + return ss.str(); +} + +bool RangeType::ExtensionEquals(const ExtensionType& other) const { + if (extension_name() != other.extension_name()) { + return false; + } + const auto& other_range = internal::checked_cast(other); + return storage_type()->Equals(*other_range.storage_type()) && + closed_ == other_range.closed_; +} + +std::string RangeType::Serialize() const { + rapidjson::Document document; + document.SetObject(); + rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); + + auto closed_str = ClosedToString(closed_); + rapidjson::Value closed_value(closed_str.data(), + static_cast(closed_str.size()), + allocator); + document.AddMember(rapidjson::Value("closed", allocator), closed_value, allocator); + + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + document.Accept(writer); + return buffer.GetString(); +} + +Result> RangeType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + // Validate storage type structure. 
+ if (storage_type->id() != Type::STRUCT) { + return Status::Invalid("RangeType storage type must be a Struct, got ", + storage_type->ToString()); + } + const auto& struct_type = internal::checked_cast(*storage_type); + if (struct_type.num_fields() != 2) { + return Status::Invalid( + "RangeType storage Struct must have exactly 2 fields, got ", + struct_type.num_fields()); + } + const auto& lower_field = struct_type.field(0); + const auto& upper_field = struct_type.field(1); + if (lower_field->name() != "lower") { + return Status::Invalid( + "RangeType storage Struct field 0 must be named \"lower\", got \"", + lower_field->name(), "\""); + } + if (upper_field->name() != "upper") { + return Status::Invalid( + "RangeType storage Struct field 1 must be named \"upper\", got \"", + upper_field->name(), "\""); + } + if (!lower_field->type()->Equals(*upper_field->type())) { + return Status::Invalid( + "RangeType storage Struct fields \"lower\" and \"upper\" must have the same " + "type, got \"", + lower_field->type()->ToString(), "\" and \"", upper_field->type()->ToString(), + "\""); + } + + // Parse the required "closed" parameter from JSON metadata. The closedness + // is not defaulted on the wire: empty metadata or a missing key is invalid. 
+ if (serialized_data.empty()) { + return Status::Invalid( + "RangeType metadata must be a JSON object with a required \"closed\" key, " + "got an empty string"); + } + rapidjson::Document document; + const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length()); + if (parsed.HasParseError()) { + return Status::Invalid("Invalid serialized JSON data for RangeType: ", + rapidjson::GetParseError_En(parsed.GetParseError()), ": ", + serialized_data); + } + if (!document.IsObject()) { + return Status::Invalid("Invalid serialized JSON data for RangeType: not an object"); + } + if (!document.HasMember("closed")) { + return Status::Invalid( + "RangeType metadata is missing the required \"closed\" key: ", serialized_data); + } + const auto& closed_val = document["closed"]; + if (!closed_val.IsString()) { + return Status::Invalid( + "Invalid serialized JSON data for RangeType: \"closed\" is not a string"); + } + ARROW_ASSIGN_OR_RAISE( + RangeClosed closed, + ClosedFromString( + std::string_view(closed_val.GetString(), closed_val.GetStringLength()))); + + return std::make_shared(std::move(storage_type), closed); +} + +std::shared_ptr RangeType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.range", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> RangeType::Make( + std::shared_ptr value_type, RangeClosed closed, bool allow_unbounded) { + auto storage = MakeStorageType(value_type, allow_unbounded); + return std::make_shared(std::move(storage), closed); +} + +// --------------------------------------------------------------------------- +// Free factory function + +std::shared_ptr range(std::shared_ptr value_type, + RangeClosed closed, bool allow_unbounded) { + auto result = RangeType::Make(std::move(value_type), closed, allow_unbounded); + ARROW_CHECK_OK(result.status()); + return std::move(result).ValueOrDie(); +} + +} // namespace 
arrow::extension diff --git a/cpp/src/arrow/extension/range.h b/cpp/src/arrow/extension/range.h new file mode 100644 index 000000000000..b9f5f55e6106 --- /dev/null +++ b/cpp/src/arrow/extension/range.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" +#include "arrow/type.h" + +namespace arrow::extension { + +/// \brief Which bound(s) of an arrow.range interval are inclusive. +/// +/// Null (infinite) bounds are always exclusive regardless of this value. +enum class RangeClosed { + /// Lower bound is inclusive, upper bound is exclusive: [lower, upper) + Left, + /// Lower bound is exclusive, upper bound is inclusive: (lower, upper] + Right, + /// Both bounds are inclusive: [lower, upper] + Both, + /// Both bounds are exclusive: (lower, upper) + Neither, +}; + +/// \brief RangeType represents a bounded set (mathematical interval) over an +/// orderable Arrow type T. +/// +/// Storage is a Struct with exactly two fields "lower" and "upper" of the same +/// orderable type T. 
Each field may independently be nullable or not: a nullable +/// bound can hold null to represent an unbounded (infinite) endpoint on that +/// side, while a non-nullable bound is always finite. +/// - "lower": T (null, when nullable = unbounded below, i.e. -infinity) +/// - "upper": T (null, when nullable = unbounded above, i.e. +infinity) +/// +/// The outer struct's validity bit marks a null/absent range. +/// +/// The "closed" parameter controls which finite bounds are inclusive. +/// Null (infinite) bounds are always treated as exclusive. +class ARROW_EXPORT RangeType : public ExtensionType { + public: + /// \brief Construct a RangeType. + /// + /// \param[in] storage_type A two-field Struct type with fields + /// "lower" and "upper" (each optionally nullable) of the same orderable Arrow type T. + /// \param[in] closed Which bound(s) are inclusive. + explicit RangeType(std::shared_ptr storage_type, RangeClosed closed) + : ExtensionType(std::move(storage_type)), closed_(closed) {} + + std::string extension_name() const override { return "arrow.range"; } + std::string ToString(bool show_metadata = false) const override; + bool ExtensionEquals(const ExtensionType& other) const override; + std::string Serialize() const override; + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// \brief Create a RangeArray from ArrayData. + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + /// \brief Factory function. + /// + /// Constructs the two-field struct storage type internally. + /// \param[in] value_type The orderable Arrow subtype T for lower and upper. + /// \param[in] closed Which bound(s) are inclusive. + /// \param[in] allow_unbounded Whether each side may be unbounded (infinite). + /// When true, the "lower" and "upper" fields are nullable and a null bound + /// denotes an infinite endpoint; when false, both bounds are non-nullable + /// and the range is always finite. Defaults to true. 
+ static Result> Make(std::shared_ptr value_type, + RangeClosed closed = RangeClosed::Left, + bool allow_unbounded = true); + + /// \brief Return the bound-inclusivity parameter. + RangeClosed closed() const { return closed_; } + + /// \brief Return the Arrow subtype T (the type of "lower" and "upper" fields). + std::shared_ptr value_type() const; + + private: + RangeClosed closed_; +}; + +/// \brief Array class for arrow.range extension arrays. +class ARROW_EXPORT RangeArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Create a RangeType with the given value subtype and closed parameter. +/// +/// This is a convenience wrapper around RangeType::Make that aborts on error. +/// For recoverable error handling prefer RangeType::Make. +ARROW_EXPORT std::shared_ptr range(std::shared_ptr value_type, + RangeClosed closed = RangeClosed::Left, + bool allow_unbounded = true); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/range_test.cc b/cpp/src/arrow/extension/range_test.cc new file mode 100644 index 000000000000..ef4463fb0c11 --- /dev/null +++ b/cpp/src/arrow/extension/range_test.cc @@ -0,0 +1,328 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/extension/range.h" +#include "arrow/extension_type.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_pointer_cast; + +// --------------------------------------------------------------------------- +// Helpers + +static std::shared_ptr RangeInt32Right() { + return checked_pointer_cast( + extension::range(int32(), extension::RangeClosed::Right)); +} + +static std::shared_ptr RangeInt32Both() { + return checked_pointer_cast( + extension::range(int32(), extension::RangeClosed::Both)); +} + +static std::shared_ptr RangeInt64Left() { + return checked_pointer_cast( + extension::range(int64(), extension::RangeClosed::Left)); +} + +// --------------------------------------------------------------------------- +// Basics + +TEST(RangeType, Basics) { + auto type = RangeInt32Right(); + ASSERT_EQ("arrow.range", type->extension_name()); + ASSERT_EQ(*int32(), *type->value_type()); + ASSERT_EQ(extension::RangeClosed::Right, type->closed()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_THAT(type->Serialize(), ::testing::Not(::testing::IsEmpty())); + ASSERT_EQ(R"({"closed":"right"})", type->Serialize()); + ASSERT_EQ("extension", + type->ToString(false)); +} + +TEST(RangeType, AllClosedValues) { + using C = extension::RangeClosed; + auto left = checked_pointer_cast( + extension::range(int32(), C::Left)); + auto right = checked_pointer_cast( + extension::range(int32(), C::Right)); + auto both = checked_pointer_cast( + extension::range(int32(), C::Both)); + auto neither = checked_pointer_cast( + extension::range(int32(), C::Neither)); + + ASSERT_EQ(R"({"closed":"left"})", left->Serialize()); + ASSERT_EQ(R"({"closed":"right"})", right->Serialize()); + 
ASSERT_EQ(R"({"closed":"both"})", both->Serialize()); + ASSERT_EQ(R"({"closed":"neither"})", neither->Serialize()); +} + +// --------------------------------------------------------------------------- +// Equals + +TEST(RangeType, Equals) { + auto type_i32_right = RangeInt32Right(); + auto type_i32_both = RangeInt32Both(); + auto type_i64_left = RangeInt64Left(); + auto type_i32_right2 = RangeInt32Right(); + + // Same object. + ASSERT_EQ(*type_i32_right, *type_i32_right); + + // Different instances but same parameters. + ASSERT_EQ(*type_i32_right, *type_i32_right2); + + // Different closed value. + ASSERT_NE(*type_i32_right, *type_i32_both); + + // Different value_type. + ASSERT_NE(*type_i32_right, *type_i64_left); + + // Not equal to a non-range type. + ASSERT_NE(*type_i32_right, *arrow::null()); + ASSERT_NE(*type_i32_right, *arrow::int32()); +} + +// --------------------------------------------------------------------------- +// CreateFromArray + +TEST(RangeType, CreateFromArray) { + auto type = RangeInt32Right(); + // Build a StructArray that matches the storage type. 
+ auto storage_type = type->storage_type(); + auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); + auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); + ASSERT_OK_AND_ASSIGN(auto storage, StructArray::Make({lower, upper}, + {field("lower", int32(), true), + field("upper", int32(), true)})); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(3, array->length()); + ASSERT_EQ(0, array->null_count()); +} + +// --------------------------------------------------------------------------- +// Deserialize - valid cases + +namespace { + +void CheckDeserialize(const std::string& serialized, + const std::shared_ptr& expected) { + auto type = checked_pointer_cast(expected); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*expected, *deserialized); +} + +} // namespace + +TEST(RangeType, Deserialize) { + // Normal JSON + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "right"})", + extension::range(int32(), extension::RangeClosed::Right))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "left"})", + extension::range(int32(), extension::RangeClosed::Left))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "both"})", + extension::range(int32(), extension::RangeClosed::Both))); + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "neither"})", + extension::range(int32(), extension::RangeClosed::Neither))); + + // Extra fields are tolerated (forward-compatibility). + ASSERT_NO_FATAL_FAILURE( + CheckDeserialize(R"({"closed": "right", "extra": 42})", + extension::range(int32(), extension::RangeClosed::Right))); +} + +TEST(RangeType, DefaultClosedIsLeft) { + // The C++ convenience default is left-closed; the wire format still always + // carries an explicit "closed". 
+ auto type = checked_pointer_cast(extension::range(int32())); + ASSERT_EQ(extension::RangeClosed::Left, type->closed()); + ASSERT_EQ(R"({"closed":"left"})", type->Serialize()); +} + +// --------------------------------------------------------------------------- +// Deserialize - invalid cases + +TEST(RangeType, DeserializeInvalidMetadata) { + auto type = RangeInt32Right(); + + // "closed" is required on the wire: empty metadata is invalid. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("empty string"), + type->Deserialize(type->storage_type(), "")); + + // A JSON object without the "closed" key is invalid. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("missing the required \"closed\" key"), + type->Deserialize(type->storage_type(), "{}")); + + // Truly malformed JSON fails. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Missing a name for object member"), + type->Deserialize(type->storage_type(), "{")); + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("not an object"), + type->Deserialize(type->storage_type(), "[]")); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("\"closed\" is not a string"), + type->Deserialize(type->storage_type(), R"({"closed": 42})")); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("Invalid value for RangeType"), + type->Deserialize(type->storage_type(), R"({"closed": "unknown"})")); +} + +TEST(RangeType, DeserializeInvalidStorage) { + auto type = RangeInt32Right(); + auto wrong_storage_not_struct = int32(); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("must be a Struct"), + type->Deserialize(wrong_storage_not_struct, R"({"closed":"right"})")); + + // Wrong number of fields. 
+ auto one_field = struct_({field("lower", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("exactly 2 fields"), + type->Deserialize(one_field, R"({"closed":"right"})")); + + // Wrong field name for field 0. + auto bad_lower_name = + struct_({field("start", int32(), true), field("upper", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("named \"lower\""), + type->Deserialize(bad_lower_name, R"({"closed":"right"})")); + + // Wrong field name for field 1. + auto bad_upper_name = + struct_({field("lower", int32(), true), field("end", int32(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("named \"upper\""), + type->Deserialize(bad_upper_name, R"({"closed":"right"})")); + + // Fields have different types. + auto mismatched_types = + struct_({field("lower", int32(), true), field("upper", int64(), true)}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("same type"), + type->Deserialize(mismatched_types, R"({"closed":"right"})")); +} + +// --------------------------------------------------------------------------- +// Non-nullable / asymmetric bounds +// +// Bound nullability is only needed to represent an unbounded (infinite) +// endpoint; non-nullable bounds describe a finite-only range and are accepted. + +TEST(RangeType, NonNullableBounds) { + auto type = RangeInt32Right(); + + // Both bounds non-nullable: accepted (a finite-only range). + auto both_non_nullable = struct_( + {field("lower", int32(), /*nullable=*/false), + field("upper", int32(), /*nullable=*/false)}); + ASSERT_OK_AND_ASSIGN(auto from_non_nullable, + type->Deserialize(both_non_nullable, R"({"closed":"right"})")); + ASSERT_EQ( + *int32(), + *checked_pointer_cast(from_non_nullable)->value_type()); + + // Asymmetric: lower nullable (may be -inf), upper non-nullable (always finite). 
+ auto asymmetric = struct_( + {field("lower", int32(), /*nullable=*/true), + field("upper", int32(), /*nullable=*/false)}); + ASSERT_OK_AND_ASSIGN(auto from_asymmetric, + type->Deserialize(asymmetric, R"({"closed":"left"})")); + ASSERT_EQ(extension::RangeClosed::Left, + checked_pointer_cast(from_asymmetric)->closed()); + + // The factory can build non-nullable bounds via allow_unbounded=false. + auto finite = checked_pointer_cast( + extension::range(int32(), extension::RangeClosed::Both, /*allow_unbounded=*/false)); + const auto& finite_storage = + internal::checked_cast(*finite->storage_type()); + ASSERT_FALSE(finite_storage.field(0)->nullable()); + ASSERT_FALSE(finite_storage.field(1)->nullable()); +} + +// --------------------------------------------------------------------------- +// Metadata (Serialize/Deserialize) round-trip + +TEST(RangeType, MetadataRoundTrip) { + using C = extension::RangeClosed; + for (const auto& type : + {extension::range(int32(), C::Left), extension::range(int32(), C::Right), + extension::range(int32(), C::Both), extension::range(int32(), C::Neither), + extension::range(int64(), C::Right), extension::range(date32(), C::Both)}) { + auto rt = checked_pointer_cast(type); + std::string serialized = rt->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + rt->Deserialize(rt->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized) << "Round-trip failed for: " << type->ToString(); + } +} + +// --------------------------------------------------------------------------- +// IPC (BatchRoundTrip) -- registration round-trip + +TEST(RangeType, BatchRoundTrip) { + auto type = RangeInt32Right(); + auto lower = ArrayFromJSON(int32(), "[1, null, 5]"); + auto upper = ArrayFromJSON(int32(), "[10, 20, null]"); + ASSERT_OK_AND_ASSIGN(auto storage, StructArray::Make({lower, upper}, + {field("lower", int32(), true), + field("upper", int32(), true)})); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + 
RecordBatch::Make(schema({field("rng", type)}), array->length(), {array}); + + std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index ce88c9517411..1dd840621f20 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -31,6 +31,7 @@ #ifdef ARROW_JSON # include "arrow/extension/fixed_shape_tensor.h" # include "arrow/extension/opaque.h" +# include "arrow/extension/range.h" # include "arrow/extension/variable_shape_tensor.h" #endif #include "arrow/extension/json.h" @@ -156,6 +157,7 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); ext_types.push_back(extension::opaque(null(), "", "")); + ext_types.push_back(extension::range(int32())); ext_types.push_back(extension::variable_shape_tensor(int64(), 0)); #endif diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 4b8faebecfd7..d03a41d56966 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -507,6 +507,7 @@ if needs_json 'sources': [ 'extension/fixed_shape_tensor.cc', 'extension/opaque.cc', + 'extension/range.cc', 'extension/tensor_internal.cc', 'extension/variable_shape_tensor.cc', 'json/options.cc', diff --git a/docs/source/cpp/api/extension.rst b/docs/source/cpp/api/extension.rst index 5b9620907f2b..6b1e2e9a8df0 100644 --- 
a/docs/source/cpp/api/extension.rst +++ b/docs/source/cpp/api/extension.rst @@ -42,6 +42,10 @@ Extension Type classes :project: arrow_cpp :members: +.. doxygenclass:: arrow::extension::RangeType + :project: arrow_cpp + :members: + Extension Array classes ======================= @@ -61,3 +65,13 @@ Extension Array classes :project: arrow_cpp :members: +.. doxygenclass:: arrow::extension::RangeArray + :project: arrow_cpp + :members: + +Extension functions +=================== + +.. doxygenfunction:: arrow::extension::range + :project: arrow_cpp + diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index c6cd8f3ea13a..1cdd9a9be56f 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -573,6 +573,94 @@ This extension type is intended to be compatible with ANSI SQL's ``TIMESTAMP WIT It is also *permissible* for the ``offset_minutes`` field to be dictionary-encoded or run-end-encoded. +.. _range_extension: + +Range +===== + +Range represents a bounded set (mathematical interval) defined by a lower and +an upper bound over an orderable Arrow type T. It is the Arrow equivalent of +PostgreSQL's `range types`_ and SQL:2011 ``PERIOD`` types. + +.. note:: + + **Disambiguation from Arrow's calendar** ``Interval`` **type.** + Arrow already has an ``Interval`` type (``INTERVAL_MONTHS``, + ``INTERVAL_DAY_TIME``, ``INTERVAL_MONTH_DAY_NANO``) that represents a + *duration* -- a signed difference between two points in time. The + ``arrow.range`` extension type is an entirely different concept: it + represents a *bounded set* with explicit lower and upper endpoints, + analogous to a closed or open interval in mathematics. The naming + follows database convention: SQL uses ``INTERVAL`` for durations and + ``RANGE`` (or ``PERIOD``) for bounded sets. + +* Extension name: ``arrow.range``. 
+ +* The storage type of the extension is a ``Struct`` with exactly **two fields, + in order**: + + * ``lower``: the lower bound, type **T**, *optionally nullable*. + When the field is nullable, a null value means the range is unbounded below + (negative infinity). + * ``upper``: the upper bound, type **T**, *optionally nullable*. + When the field is nullable, a null value means the range is unbounded above + (positive infinity). + + **T** (the *subtype* or *value type*) may be any orderable Arrow type: + integer, floating-point, decimal, date, time, or timestamp types. Both + fields share the same type T. The subtype is read directly from the + storage struct and is **not** duplicated in the extension metadata. + + Each of ``lower`` and ``upper`` **may** be nullable, independently of the + other. Nullability is **only** needed to represent an unbounded side: a + nullable bound may hold null to mean an infinite endpoint, while a + non-nullable bound is always finite. A null bound is **always treated as + exclusive**, regardless of the value of the ``closed`` parameter; positive and + negative infinity can never be included in a closed bound. A null ``lower`` + means the range extends to negative infinity, a null ``upper`` means it + extends to positive infinity, and a range whose ``lower`` and ``upper`` are + both null (and both nullable) is the universal range ``(-inf, +inf)``. The + outer struct's validity bit marks a null/absent range (a missing range, + distinct from an empty range). + +* Extension type parameters: + + * **closed** = which finite bound(s) are inclusive. 
Allowed values + (following pandas interval vocabulary): + + * ``"left"`` -- lower bound inclusive, upper bound exclusive: ``[lower, upper)`` + * ``"right"`` -- lower bound exclusive, upper bound inclusive: ``(lower, upper]`` + * ``"both"`` -- both bounds inclusive: ``[lower, upper]`` + * ``"neither"`` -- both bounds exclusive: ``(lower, upper)`` + + A range thus contains every value x permitted by its finite bounds and + ``closed`` setting: with ``closed="both"`` every x such that + ``lower <= x <= upper``, with ``closed="neither"`` every x such that + ``lower < x < upper``. A range is *empty* when ``lower > upper``, or when + ``lower == upper`` and at least one bound is exclusive. + +* Description of the serialization: + + The extension metadata **must** be a valid JSON object containing the + **required** key: + + * ``"closed"`` (string, **required**): one of ``"left"``, ``"right"``, + ``"both"``, or ``"neither"``. + + The closedness is **not** defaulted on the wire: an empty metadata string, + or a JSON object without a ``"closed"`` key, is invalid. This keeps the + serialized form unambiguous for consumers. Additional keys in the JSON + object should be ignored to allow forward-compatible extensions. + + Examples: + + - ``{"closed": "right"}`` -- half-open interval, right-closed + - ``{"closed": "left"}`` -- half-open interval, left-closed + - ``{"closed": "both"}`` -- closed interval + - ``{"closed": "neither"}`` -- open interval + +.. _range types: https://www.postgresql.org/docs/current/rangetypes.html + Community Extension Types ========================= diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 290ce09befb1..5a4dcecdc567 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -101,6 +101,7 @@ may expose data type-specific methods or properties. JsonArray UuidArray Bool8Array + RangeArray .. 
_api.scalar: @@ -169,3 +170,4 @@ classes may expose data type-specific methods or properties. JsonScalar UuidScalar Bool8Scalar + RangeScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index ea9e547d32c7..71ee00557f33 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -73,6 +73,7 @@ These should be used to create Arrow data types and schemas. sparse_union opaque bool8 + range_ uuid json_ field @@ -146,6 +147,7 @@ implemented by PyArrow. JsonType UuidType Bool8Type + RangeType .. _api.types.checking: .. currentmodule:: pyarrow.types diff --git a/docs/source/status.rst b/docs/source/status.rst index 6379741878ca..f3af1a50e0d2 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -131,6 +131,8 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Parquet Variant | | | ✓ | | | ✓ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Range | ✓ | | | | | | | | ++-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ Notes: diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index adfc50d57395..1c4e82181aee 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -165,7 +165,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - bool8, fixed_shape_tensor, json_, opaque, uuid, + bool8, fixed_shape_tensor, json_, opaque, range_, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -177,7 +177,7 @@ def print_entry(label, value): Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, Bool8Type, FixedShapeTensorType, - JsonType, OpaqueType, UuidType, + JsonType, OpaqueType, RangeType, UuidType, UnknownExtensionType, register_extension_type, unregister_extension_type, 
DictionaryMemo, @@ -214,7 +214,7 @@ def print_entry(label, value): Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, - JsonArray, OpaqueArray, UuidArray, + JsonArray, OpaqueArray, RangeArray, UuidArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -232,7 +232,8 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, - FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) + FixedShapeTensorScalar, JsonScalar, OpaqueScalar, + RangeScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ecdbb342d3e2..a25b774d8df0 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4956,6 +4956,29 @@ cdef class Bool8Array(ExtensionArray): return Bool8Array.from_storage(storage_arr) +cdef class RangeArray(ExtensionArray): + """ + Concrete class for range extension arrays. + + Examples + -------- + Define the extension type for a range array + + >>> import pyarrow as pa + >>> range_type = pa.range_(pa.int32(), "both") + + Create an extension array + + >>> storage = pa.array( + ... [{"lower": 1, "upper": 5}, {"lower": None, "upper": 10}], + ... range_type.storage_type, + ... 
) + >>> arr = pa.ExtensionArray.from_storage(range_type, storage) + >>> isinstance(arr, pa.RangeArray) + True + """ + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 79522c12474b..26acf0985ab5 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -3085,6 +3085,28 @@ cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray): pass + +cdef extern from "arrow/extension/range.h" namespace "arrow::extension" nogil: + cdef enum class CRangeClosed" arrow::extension::RangeClosed": + Left + Right + Both + Neither + + cdef cppclass CRangeType" arrow::extension::RangeType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType] value_type, + CRangeClosed closed, + c_bool allow_unbounded) + + CRangeClosed closed() + shared_ptr[CDataType] value_type() + + cdef cppclass CRangeArray" arrow::extension::RangeArray"(CExtensionArray): + pass + + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 683faa7855c5..c4195df0dfe8 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -203,6 +203,10 @@ cdef class Bool8Type(BaseExtensionType): cdef: const CBool8Type* bool8_ext_type +cdef class RangeType(BaseExtensionType): + cdef: + const CRangeType* range_ext_type + cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index d1fa1192debc..7bb56435b7e2 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -131,6 +131,8 @@ cdef api object 
pyarrow_wrap_data_type( out = Bool8Type.__new__(Bool8Type) elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) + elif extension_name == b"arrow.range": + out = RangeType.__new__(RangeType) elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) elif extension_name == b"arrow.uuid": diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index fb7de926edc1..4b2599724283 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1611,6 +1611,12 @@ cdef class Bool8Scalar(ExtensionScalar): py_val = super().as_py() return None if py_val is None else py_val != 0 + +cdef class RangeScalar(ExtensionScalar): + """ + Concrete class for range extension scalar. + """ + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 465b556876b4..3610e16b58ce 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1915,6 +1915,94 @@ def test_opaque_type(pickle_module, storage_type, storage): assert inner == storage +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) +@pytest.mark.parametrize("value_type,bounds", [ + (pa.int32(), [{"lower": 1, "upper": 5}, {"lower": None, "upper": 10}]), + (pa.int64(), [{"lower": None, "upper": None}, {"lower": 2, "upper": 8}]), + (pa.float64(), [{"lower": 0.0, "upper": 1.5}, None]), +]) +def test_range_type(pickle_module, closed, value_type, bounds): + range_type = pa.range_(value_type, closed) + assert range_type.extension_name == "arrow.range" + assert range_type.value_type == value_type + assert range_type.closed == closed + assert range_type.storage_type == pa.struct([ + pa.field("lower", value_type, nullable=True), + pa.field("upper", value_type, nullable=True), + ]) + assert "arrow.range" in str(range_type) + + # the closed parameter 
defaults to "left" + assert pa.range_(value_type).closed == "left" + + assert range_type == range_type + assert range_type == pa.range_(value_type, closed) + assert range_type != value_type + # different closed parameter -> not equal + other_closed = "right" if closed != "right" else "left" + assert range_type != pa.range_(value_type, other_closed) + # different value type -> not equal + assert range_type != pa.range_(pa.decimal128(12, 3), closed) + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(range_type)) + assert result == range_type + assert result.closed == closed + assert result.value_type == value_type + + # IPC roundtrip + range_arr_class = range_type.__arrow_ext_class__() + storage = pa.array(bounds, range_type.storage_type) + arr = pa.ExtensionArray.from_storage(range_type, storage) + assert isinstance(arr, range_arr_class) + + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.range" + assert batch.column(0).type.closed == closed + assert isinstance(batch.column(0), range_arr_class) + assert batch.column(0) == arr + + # cast storage -> extension type + result = storage.cast(range_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(range_type.storage_type) + assert inner == storage + + +def test_range_type_invalid_closed(): + with pytest.raises(ValueError, match="Invalid value for range"): + pa.range_(pa.int32(), "invalid") + with pytest.raises(ValueError, match="Invalid value for range"): + pa.range_(pa.int32(), "") + + +def test_range_type_allow_unbounded(): + # Default: bounds are nullable (can represent an unbounded / infinite side). 
+ nullable = pa.range_(pa.int32(), "both") + assert nullable.storage_type.field("lower").nullable + assert nullable.storage_type.field("upper").nullable + + # allow_unbounded=False: a finite-only range with non-nullable bounds. + finite = pa.range_(pa.int32(), "both", allow_unbounded=False) + assert not finite.storage_type.field("lower").nullable + assert not finite.storage_type.field("upper").nullable + assert finite.value_type == pa.int32() + assert finite.closed == "both" + + # Distinct types: storage nullability differs. + assert finite != nullable + + # A non-nullable-bounds range round-trips through its storage. + storage = pa.array([{"lower": 1, "upper": 5}], finite.storage_type) + arr = pa.ExtensionArray.from_storage(finite, storage) + assert arr.type == finite + + def test_bool8_type(pickle_module): bool8_type = pa.bool8() storage_type = pa.int8() diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ec1a5a2ba9a3..0777c181c502 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -2091,6 +2091,62 @@ cdef class Bool8Type(BaseExtensionType): return Bool8Scalar +cdef class RangeType(BaseExtensionType): + """ + Concrete class for range extension type. + + Range represents a bounded set (a mathematical interval) over an orderable + Arrow value type. The underlying storage is a Struct with two fields + "lower" and "upper" of the value type, each optionally nullable; when a + bound field is nullable, a null value denotes an unbounded (infinite) side. + The "closed" parameter controls which finite bounds are inclusive. + + Examples + -------- + Create an instance of range extension type: + + >>> import pyarrow as pa + >>> pa.range_(pa.int32(), "both") + RangeType(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.range_ext_type = <const CRangeType*> type.get() + + @property + def value_type(self): + """ + The Arrow value type of the "lower" and "upper" bounds.
+ """ + return pyarrow_wrap_data_type(self.range_ext_type.value_type()) + + @property + def closed(self): + """ + Which bound(s) are inclusive, as one of "left", "right", "both" or + "neither". + """ + cdef CRangeClosed c_closed = self.range_ext_type.closed() + if c_closed == CRangeClosed.Left: + return "left" + elif c_closed == CRangeClosed.Right: + return "right" + elif c_closed == CRangeClosed.Both: + return "both" + else: + return "neither" + + def __arrow_ext_class__(self): + return RangeArray + + def __reduce__(self): + return range_, (self.value_type, self.closed, self.storage_type[0].nullable) + + def __arrow_ext_scalar_class__(self): + return RangeScalar + + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5706,6 +5762,78 @@ def bool8(): return out +def range_(DataType value_type not None, str closed="left", allow_unbounded=True): + """ + Create instance of range extension type. + + Parameters + ---------- + value_type : DataType + The orderable Arrow type of the "lower" and "upper" interval bounds. + closed : str, default "left" + Which bound(s) are inclusive. One of "left", "right", "both" or + "neither". + allow_unbounded : bool, default True + Whether each side may be unbounded (infinite). When True the "lower" and + "upper" storage fields are nullable (a null bound is an infinite + endpoint); when False both bounds are non-nullable and the range is + always finite. + + Examples + -------- + Create an instance of a range extension type: + + >>> import pyarrow as pa + >>> type = pa.range_(pa.int32(), "both") + >>> type + RangeType(extension) + + Inspect the data type: + + >>> type.value_type + DataType(int32) + >>> type.closed + 'both' + >>> type.storage_type + StructType(struct) + + Create a range array: + + >>> storage = pa.array( + ... [{"lower": 1, "upper": 5}, {"lower": None, "upper": 10}], + ... type.storage_type, + ...
) + >>> arr = pa.ExtensionArray.from_storage(type, storage) + >>> arr.type + RangeType(extension) + + Returns + ------- + type : RangeType + """ + + cdef CRangeClosed c_closed + if closed == "left": + c_closed = CRangeClosed.Left + elif closed == "right": + c_closed = CRangeClosed.Right + elif closed == "both": + c_closed = CRangeClosed.Both + elif closed == "neither": + c_closed = CRangeClosed.Neither + else: + raise ValueError( + f"Invalid value for range \"closed\" parameter: {closed!r}. " + "Expected one of: 'left', 'right', 'both', 'neither'.") + + cdef: + shared_ptr[CDataType] c_type = GetResultValue( + CRangeType.Make(value_type.sp_type, c_closed, allow_unbounded)) + RangeType out = RangeType.__new__(RangeType) + out.init(c_type) + return out + + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type.