Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions encodings/fsst/src/dfa/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,13 @@ enum LikeKind<'a> {

impl<'a> LikeKind<'a> {
fn parse(pattern: &'a [u8]) -> Option<Self> {
// The fast-path matchers below do not understand SQL LIKE escape sequences (e.g. `\%`
// matching a literal `%`). If the pattern contains a backslash we fall back to the
// general implementation, which correctly interprets escapes.
if pattern.contains(&b'\\') {
return None;
}

// `prefix%` (including just `%` where prefix is empty)
if let Some(prefix) = pattern.strip_suffix(b"%")
&& !prefix.contains(&b'%')
Expand Down
10 changes: 10 additions & 0 deletions encodings/fsst/src/dfa/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ fn test_like_kind_parse() {
// Suffix and underscore patterns are not supported.
assert!(LikeKind::parse(b"%suffix").is_none());
assert!(LikeKind::parse(b"a_c").is_none());

// Patterns containing the SQL LIKE escape character must NOT be parsed by the fast path,
// because that path treats `%` and `_` literally and would misinterpret escapes. For
// example, `%\%` (the pattern produced by Spark's `endsWith("%")`) means "ends with `%`",
// not "contains `\`". The fast path should bail so the general implementation handles it.
assert!(LikeKind::parse(br"%\%").is_none());
assert!(LikeKind::parse(br"\%%").is_none());
assert!(LikeKind::parse(br"%\_%").is_none());
assert!(LikeKind::parse(br"\_%").is_none());
assert!(LikeKind::parse(br"%\\%").is_none());
}

/// No symbols — all bytes escaped. Simplest case to see the two tables.
Expand Down
7 changes: 6 additions & 1 deletion java/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -78,5 +78,10 @@ allprojects {
}
}

tasks.register("format").get().dependsOn("spotlessApply")
if (project.name == "vortex-spark_2.12") {
// vortex-spark_2.12 and vortex-spark_2.13 share a projectDir; format from the 2.13 variant only.
tasks.register("format") { enabled = false }
} else {
tasks.register("format").get().dependsOn("spotlessApply")
}
}
136 changes: 136 additions & 0 deletions java/vortex-jni/src/main/java/dev/vortex/api/Expression.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.google.common.base.Preconditions;
import dev.vortex.VortexCleaner;
import dev.vortex.jni.NativeExpression;
import java.math.BigInteger;
import java.util.Arrays;

/**
Expand Down Expand Up @@ -44,6 +45,19 @@ public static Expression column(String fieldName) {
return getItem(fieldName, root());
}

/**
* Access a nested field by walking {@code fieldNames} starting from the root of the array. With a single name this
* is equivalent to {@link #column(String)}.
*/
public static Expression column(String[] fieldNames) {
Preconditions.checkArgument(fieldNames.length > 0, "column requires at least one field name");
Expression expr = root();
for (String name : fieldNames) {
expr = getItem(name, expr);
}
return expr;
}

/** Project a subset of fields out of a struct expression. */
public static Expression select(String[] fieldNames, Expression child) {
return new Expression(NativeExpression.select(fieldNames, child.nativePointer()));
Expand Down Expand Up @@ -73,6 +87,33 @@ public static Expression isNull(Expression child) {
return new Expression(NativeExpression.isNull(child.nativePointer()));
}

public static Expression isNotNull(Expression child) {
return new Expression(NativeExpression.isNotNull(child.nativePointer()));
}

/**
* SQL {@code LIKE} pattern match.
*
* @param negated whether to invert the result (i.e. {@code NOT LIKE})
* @param caseInsensitive whether to perform case-insensitive matching ({@code ILIKE})
*/
public static Expression like(Expression child, Expression pattern, boolean negated, boolean caseInsensitive) {
return new Expression(
NativeExpression.like(child.nativePointer(), pattern.nativePointer(), negated, caseInsensitive));
}

/**
* {@code value BETWEEN lower AND upper}.
*
* @param lowerStrict {@code true} for {@code lower < value}; {@code false} for {@code lower <= value}.
* @param upperStrict {@code true} for {@code value < upper}; {@code false} for {@code value <= upper}.
*/
public static Expression between(
Expression value, Expression lower, Expression upper, boolean lowerStrict, boolean upperStrict) {
return new Expression(NativeExpression.between(
value.nativePointer(), lower.nativePointer(), upper.nativePointer(), lowerStrict, upperStrict));
}

public static Expression literal(boolean value) {
return new Expression(NativeExpression.literalBool(value, false));
}
Expand Down Expand Up @@ -109,6 +150,59 @@ public static Expression literal(String value) {
return new Expression(NativeExpression.literalString(value));
}

public static Expression literal(byte[] value) {
Preconditions.checkArgument(value != null, "use nullLiteral(DType.BINARY) for a null binary literal");
return new Expression(NativeExpression.literalBinary(value));
}

/**
* Create a decimal literal from its unscaled two's-complement big-endian byte representation (i.e. the value
* returned by {@link BigInteger#toByteArray()}).
*/
public static Expression literalDecimal(BigInteger unscaledValue, int precision, int scale) {
Preconditions.checkArgument(unscaledValue != null, "unscaledValue must not be null");
return new Expression(NativeExpression.literalDecimal(unscaledValue.toByteArray(), precision, scale, false));
}

/** Create a null decimal literal with the specified precision and scale. */
public static Expression nullLiteralDecimal(int precision, int scale) {
return new Expression(NativeExpression.literalDecimal(new byte[] {0}, precision, scale, true));
}

/**
* Create a Date literal. The {@code value} is the number of {@code unit} units since the Unix epoch.
*
* @param unit only {@link TimeUnit#DAYS} and {@link TimeUnit#MILLISECONDS} are valid for Date.
*/
public static Expression literalDate(long value, TimeUnit unit) {
return new Expression(NativeExpression.literalDate(value, unit.tag(), false));
}

/** Null Date literal. See {@link #literalDate(long, TimeUnit)} for the {@code unit} constraints. */
public static Expression nullLiteralDate(TimeUnit unit) {
return new Expression(NativeExpression.literalDate(0L, unit.tag(), true));
}

/**
* Create a Timestamp literal. The {@code value} is the number of {@code unit} units since the Unix epoch.
*
* @param timezone optional IANA timezone identifier (e.g. {@code "UTC"}, {@code "America/Los_Angeles"}). Pass
* {@code null} for a local (zone-naive) timestamp.
*/
public static Expression literalTimestamp(long value, TimeUnit unit, String timezone) {
return new Expression(NativeExpression.literalTimestamp(value, unit.tag(), timezone, false));
}

/** Null Timestamp literal. See {@link #literalTimestamp(long, TimeUnit, String)} for parameter semantics. */
public static Expression nullLiteralTimestamp(TimeUnit unit, String timezone) {
return new Expression(NativeExpression.literalTimestamp(0L, unit.tag(), timezone, true));
}

/** Create a typed null literal of the given primitive {@link DType}. */
public static Expression nullLiteral(DType dtype) {
return new Expression(NativeExpression.literalNull(dtype.tag()));
}

private static long[] nativePointers(Expression[] exprs) {
return Arrays.stream(exprs).mapToLong(Expression::nativePointer).toArray();
}
Expand Down Expand Up @@ -138,4 +232,46 @@ public byte code() {
return code;
}
}

/** Time units for Date/Timestamp literals. Tag values must match the Rust {@code parse_time_unit} table. */
public enum TimeUnit {
NANOSECONDS((byte) 0),
MICROSECONDS((byte) 1),
MILLISECONDS((byte) 2),
SECONDS((byte) 3),
DAYS((byte) 4);

private final byte tag;

TimeUnit(byte tag) {
this.tag = tag;
}

public byte tag() {
return tag;
}
}

/** Primitive {@link DType}s that can be used to construct typed null literals via {@link #nullLiteral(DType)}. */
public enum DType {
BOOL((byte) 0),
I8((byte) 1),
I16((byte) 2),
I32((byte) 3),
I64((byte) 4),
F32((byte) 5),
F64((byte) 6),
UTF8((byte) 7),
BINARY((byte) 8);

private final byte tag;

DType(byte tag) {
this.tag = tag;
}

public byte tag() {
return tag;
}
}
}
17 changes: 17 additions & 0 deletions java/vortex-jni/src/main/java/dev/vortex/jni/NativeExpression.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ private NativeExpression() {}

public static native long isNull(long childPointer);

public static native long isNotNull(long childPointer);

public static native long like(long childPointer, long patternPointer, boolean negated, boolean caseInsensitive);

public static native long between(
long valuePointer, long lowerPointer, long upperPointer, boolean lowerStrict, boolean upperStrict);

public static native long literalBool(boolean value, boolean isNull);

public static native long literalI8(byte value, boolean isNull);
Expand All @@ -43,5 +50,15 @@ private NativeExpression() {}

public static native long literalString(String value);

public static native long literalBinary(byte[] value);

public static native long literalDecimal(byte[] unscaledBigEndian, int precision, int scale, boolean isNull);

public static native long literalDate(long value, byte timeUnitTag, boolean isNull);

public static native long literalTimestamp(long value, byte timeUnitTag, String timezone, boolean isNull);

public static native long literalNull(byte dtypeTag);

public static native void free(long pointer);
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableProvider;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.sources.DataSourceRegister;
import org.apache.spark.sql.types.DataType;
Expand Down Expand Up @@ -118,6 +119,38 @@ public StructType inferSchema(CaseInsensitiveStringMap options) {
return dataSchema;
}

/**
* Infers partition transforms by inspecting Hive-style {@code key=value} segments in the first listed file path.
*
* <p>Spark calls this before {@link #getTable(StructType, Transform[], Map)} when the caller did not provide
* explicit partitioning. Returning identity transforms here lets downstream components (notably
* {@link dev.vortex.spark.read.VortexScanBuilder}) tell which schema columns are encoded in the directory layout
* rather than stored inside the Vortex files, which matters for predicate pushdown.
*/
@Override
public Transform[] inferPartitioning(CaseInsensitiveStringMap options) {
var paths = getPaths(options);
if (paths.isEmpty()) {
return new Transform[0];
}
var formatOptions = buildDataSourceOptions(options.asCaseSensitiveMap());
String pathToInfer = Objects.requireNonNull(Iterables.getLast(paths));
if (!pathToInfer.endsWith(".vortex")) {
Optional<String> firstFile =
NativeFiles.listFiles(VortexSparkSession.get(formatOptions), pathToInfer, formatOptions).stream()
.findFirst();
if (firstFile.isEmpty()) {
return new Transform[0];
}
pathToInfer = firstFile.get();
}
Map<String, String> partitionValues = PartitionPathUtils.parsePartitionValues(pathToInfer);
if (partitionValues.isEmpty()) {
return new Transform[0];
}
return partitionValues.keySet().stream().map(Expressions::identity).toArray(Transform[]::new);
}

/**
* Creates a Vortex table instance with the given schema and properties.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
Map<String, String> opts = Maps.newHashMap();
opts.putAll(formatOptions);
opts.putAll(options);
return new VortexScanBuilder(opts)
return new VortexScanBuilder(opts, partitionTransforms)
.addAllPaths(paths)
.addAllColumns(Arrays.asList(CatalogV2Util.structTypeToV2Columns(schema)));
}
Expand Down

This file was deleted.

Loading
Loading