From ae1dd8ae923c701c9a5a05bef54634d834790311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Manciot?= Date: Fri, 5 Jun 2026 10:39:37 +0200 Subject: [PATCH 1/6] feat(sql): support ORDER BY ... NULLS FIRST / NULLS LAST Translate the ANSI NULLS FIRST / NULLS LAST clause to Elasticsearch's sort.missing parameter (_first / _last). Add NullOrdering AST + parser support (case-insensitive), and apply .missing(...) on the FieldSort builder across the ES7/8/9 bridge template and the hand-maintained ES6 bridge. Reject NULLS ordering on aggregation / GROUP BY ORDER BY (ES terms aggregations have no missing parameter) rather than silently dropping it. Covered by ParserSpec round-trip/reject cases, SQLQuerySpec JSON-emission assertions, and per-version integration specs (es6/jest, es6/rest, es7/rest, es8/java, es9/java). Docs updated in dql_statements.md. Closed Issue #99 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../elastic/sql/bridge/package.scala | 7 +- .../elastic/sql/SQLQuerySpec.scala | 38 +++++ documentation/sql/dql_statements.md | 37 ++++- .../elastic/sql/bridge/package.scala | 7 +- .../client/JestClientSortNullsSpec.scala | 19 +++ .../RestHighLevelClientSortNullsSpec.scala | 19 +++ .../RestHighLevelClientSortNullsSpec.scala | 19 +++ .../client/JavaClientSortNullsSpec.scala | 19 +++ .../client/JavaClientSortNullsSpec.scala | 19 +++ .../elastic/sql/parser/OrderByParser.scala | 20 ++- .../elastic/sql/query/OrderBy.scala | 12 +- .../elastic/sql/query/package.scala | 22 +++ .../elastic/sql/parser/ParserSpec.scala | 58 +++++++ .../elastic/client/SortNullsSpec.scala | 146 ++++++++++++++++++ 14 files changed, 434 insertions(+), 8 deletions(-) create mode 100644 es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientSortNullsSpec.scala create mode 100644 es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala create mode 100644 
es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala create mode 100644 es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala create mode 100644 es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala create mode 100644 testkit/src/main/scala/app/softnetwork/elastic/client/SortNullsSpec.scala diff --git a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala index 36c73605..f828e25a 100644 --- a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala +++ b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala @@ -579,10 +579,15 @@ package object bridge { case _ => scriptSort.asc() } } else { - sort.order match { + val baseSort = sort.order match { case Some(Desc) => FieldSort(sort.field.aliasOrName).desc() case _ => FieldSort(sort.field.aliasOrName).asc() } + sort.nullOrdering match { + case Some(NullsFirst) => baseSort.missing("_first") + case Some(NullsLast) => baseSort.missing("_last") + case None => baseSort + } } } case _ => _search diff --git a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index cedde523..24467835 100644 --- a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -4193,4 +4193,42 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { .replaceAll(">", " > ") } + // === Story 14.1: NULLS FIRST / NULLS LAST on ORDER BY === + + it should "emit sort with missing=_last for ORDER BY ... 
DESC NULLS LAST" in { + val select: ElasticSearchRequest = + SelectStatement("SELECT identifier FROM Table ORDER BY identifier DESC NULLS LAST") + val query = select.query + query should include("\"missing\":\"_last\"") + query should include("\"order\":\"desc\"") + } + + it should "emit sort with missing=_first for ORDER BY ... ASC NULLS FIRST" in { + val select: ElasticSearchRequest = + SelectStatement("SELECT identifier FROM Table ORDER BY identifier ASC NULLS FIRST") + val query = select.query + query should include("\"missing\":\"_first\"") + query should include("\"order\":\"asc\"") + } + + it should "emit per-field null ordering for multi-column ORDER BY" in { + val select: ElasticSearchRequest = + SelectStatement( + "SELECT identifier FROM Table ORDER BY a DESC NULLS LAST, b ASC NULLS FIRST" + ) + val query = select.query + // Bind each missing value to its own sort field so a swapped mapping + // (e.g. NullsFirst -> _last) cannot pass: the missing key must live inside + // the same JSON object as the field it belongs to. + query should include regex """"a":\{[^}]*"missing":"_last"""" + query should include regex """"b":\{[^}]*"missing":"_first"""" + } + + it should "omit the missing field when no null ordering specified" in { + val select: ElasticSearchRequest = + SelectStatement("SELECT identifier FROM Table ORDER BY identifier DESC") + val query = select.query + query should not include "\"missing\"" + } + } diff --git a/documentation/sql/dql_statements.md b/documentation/sql/dql_statements.md index 586d3feb..cb9ce846 100644 --- a/documentation/sql/dql_statements.md +++ b/documentation/sql/dql_statements.md @@ -62,7 +62,7 @@ FROM table_name [alias] [WHERE condition] [GROUP BY expr1, expr2, ...] [HAVING condition] -[ORDER BY expr1 [ASC|DESC], ...] +[ORDER BY expr1 [ASC|DESC] [NULLS FIRST|NULLS LAST], ...] 
[LIMIT n] [OFFSET m]; ``` @@ -127,6 +127,7 @@ WHERE age BETWEEN 20 AND 50 - Supports multiple sort keys - Supports `ASC` and `DESC` +- Supports `NULLS FIRST` / `NULLS LAST` per sort key (see below) - Supports expressions and nested fields (e.g., `profile.city`) - When used inside a window function (`OVER`), `ORDER BY` defines the logical ordering of the window @@ -139,6 +140,40 @@ ORDER BY age DESC, name ASC LIMIT 2 OFFSET 1; ``` +### NULLS FIRST / NULLS LAST + +Each sort key may declare where `NULL` values appear in the result: + +```sql +SELECT id, name, bonus +FROM dql_users +ORDER BY bonus DESC NULLS LAST; +``` + +Mapped to Elasticsearch's `sort.missing` parameter: + +- `NULLS FIRST` → `"missing": "_first"` +- `NULLS LAST` → `"missing": "_last"` + +When `NULLS FIRST` / `NULLS LAST` is omitted, no `missing` value is sent and the +Elasticsearch default (`_last`) applies regardless of sort direction: + +- `ASC` → nulls last +- `DESC` → nulls last + +Different null orderings can be combined within a single query: + +```sql +SELECT id, name, bonus, hire_date +FROM dql_users +ORDER BY bonus DESC NULLS LAST, hire_date ASC NULLS FIRST; +``` + +**Caveat (ES6 Jest client)**: scroll / `search_after` queries in the ES6 Jest +client do not propagate `NULLS FIRST` / `NULLS LAST` reliably across batches +(search_after's null handling is implementation-defined in Jest). For ES6 +scroll/search_after, prefer client-side null-bucketing or upgrade to ES7+. 
+ --- ## LIMIT / OFFSET diff --git a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala index 1a1b2ad6..6f3672e9 100644 --- a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala +++ b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala @@ -575,10 +575,15 @@ package object bridge { case _ => scriptSort.asc() } } else { - sort.order match { + val baseSort = sort.order match { case Some(Desc) => FieldSort(sort.field.aliasOrName).desc() case _ => FieldSort(sort.field.aliasOrName).asc() } + sort.nullOrdering match { + case Some(NullsFirst) => baseSort.missing("_first") + case Some(NullsLast) => baseSort.missing("_last") + case None => baseSort + } } } case _ => _search diff --git a/es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientSortNullsSpec.scala b/es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientSortNullsSpec.scala new file mode 100644 index 00000000..da342af9 --- /dev/null +++ b/es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientSortNullsSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package app.softnetwork.elastic.client + +class JestClientSortNullsSpec extends SortNullsSpec diff --git a/es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala b/es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala new file mode 100644 index 00000000..d4b40dd8 --- /dev/null +++ b/es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +class RestHighLevelClientSortNullsSpec extends SortNullsSpec diff --git a/es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala b/es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala new file mode 100644 index 00000000..d4b40dd8 --- /dev/null +++ b/es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientSortNullsSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +class RestHighLevelClientSortNullsSpec extends SortNullsSpec diff --git a/es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala b/es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala new file mode 100644 index 00000000..6d2ebb2c --- /dev/null +++ b/es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package app.softnetwork.elastic.client + +class JavaClientSortNullsSpec extends SortNullsSpec diff --git a/es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala b/es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala new file mode 100644 index 00000000..6d2ebb2c --- /dev/null +++ b/es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientSortNullsSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package app.softnetwork.elastic.client + +class JavaClientSortNullsSpec extends SortNullsSpec diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/OrderByParser.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/OrderByParser.scala index 49e3efeb..d03db314 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/OrderByParser.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/OrderByParser.scala @@ -18,7 +18,15 @@ package app.softnetwork.elastic.sql.parser import app.softnetwork.elastic.sql.Identifier import app.softnetwork.elastic.sql.function.Function -import app.softnetwork.elastic.sql.query.{Asc, Desc, FieldSort, OrderBy} +import app.softnetwork.elastic.sql.query.{ + Asc, + Desc, + FieldSort, + NullOrdering, + NullsFirst, + NullsLast, + OrderBy +} trait OrderByParser { self: Parser => @@ -27,6 +35,12 @@ trait OrderByParser { def desc: PackratParser[Desc.type] = Desc.regex ^^ (_ => Desc) + def nullsFirst: PackratParser[NullsFirst.type] = NullsFirst.regex ^^ (_ => NullsFirst) + + def nullsLast: PackratParser[NullsLast.type] = NullsLast.regex ^^ (_ => NullsLast) + + def nullOrdering: PackratParser[NullOrdering] = nullsFirst | nullsLast + private def fieldName: PackratParser[String] = """\b(?!(?i)limit\b)[a-zA-Z_][a-zA-Z0-9_]*""".r ^^ (f => f) @@ -41,8 +55,8 @@ trait OrderByParser { identifier def sort: PackratParser[FieldSort] = - fieldWithFunction ~ (asc | desc).? ^^ { case f ~ o => - FieldSort(f, o) + fieldWithFunction ~ (asc | desc).? ~ nullOrdering.? 
^^ { case f ~ o ~ n => + FieldSort(f, o, n) } def orderBy: PackratParser[OrderBy] = OrderBy.regex ~ rep1sep(sort, separator) ^^ { case _ ~ s => diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/query/OrderBy.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/query/OrderBy.scala index b0a4f062..910f2ec5 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/query/OrderBy.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/query/OrderBy.scala @@ -27,15 +27,23 @@ case object Desc extends Expr("DESC") with SortOrder case object Asc extends Expr("ASC") with SortOrder +sealed trait NullOrdering extends TokenRegex + +case object NullsFirst extends Expr("NULLS FIRST") with NullOrdering + +case object NullsLast extends Expr("NULLS LAST") with NullOrdering + case class FieldSort( field: Identifier, - order: Option[SortOrder] + order: Option[SortOrder], + nullOrdering: Option[NullOrdering] = None ) extends FunctionChain with Updateable { lazy val functions: List[Function] = field.functions lazy val direction: SortOrder = order.getOrElse(Asc) lazy val name: String = field.identifierName - override def sql: String = s"$name $direction" + override def sql: String = + s"$name $direction${nullOrdering.map(n => s" ${n.sql}").getOrElse("")}" override def update(request: SingleSearch): FieldSort = this.copy( field = field.update(request) ) diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/query/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/query/package.scala index 1434d4b5..d4414989 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/query/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/query/package.scala @@ -336,6 +336,28 @@ package object query { Right(()) } } + _ <- { + // NULLS FIRST / NULLS LAST cannot be honored when the query is an + // aggregation / GROUP BY query: Elasticsearch terms aggregations have + // no `missing` parameter, so the bridge routes these sorts through the + // aggregation 
path (guarded by `aggregates.isEmpty && buckets.isEmpty`) + // and would silently drop the null ordering. Reject explicitly. + if (aggregates.nonEmpty || buckets.nonEmpty) { + val nullOrdered = + orderBy.map(_.sorts.filter(_.nullOrdering.isDefined)).getOrElse(Seq.empty) + if (nullOrdered.nonEmpty) { + Left( + s"NULLS FIRST / NULLS LAST is not supported on ORDER BY when GROUP BY or aggregations are present (offending sort: ${nullOrdered + .map(_.sql) + .mkString(", ")})" + ) + } else { + Right(()) + } + } else { + Right(()) + } + } } yield () } diff --git a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala index e7bde455..d6faa351 100644 --- a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala +++ b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala @@ -108,6 +108,12 @@ object Queries { val groupBy = "SELECT identifier, COUNT(identifier2) FROM Table WHERE identifier2 is NOT null GROUP BY identifier" val orderBy = "SELECT * FROM Table ORDER BY identifier DESC" + val orderByNullsFirst = "SELECT * FROM Table ORDER BY identifier DESC NULLS FIRST" + val orderByNullsLast = "SELECT * FROM Table ORDER BY identifier ASC NULLS LAST" + val orderByMixedNulls = + "SELECT * FROM Table ORDER BY a DESC NULLS LAST, b ASC NULLS FIRST" + val orderByLowerNulls = "SELECT * FROM Table ORDER BY id desc nulls first" + val orderByNullsNoDirection = "SELECT * FROM Table ORDER BY identifier NULLS LAST" val limit = "SELECT * FROM Table LIMIT 10 OFFSET 2" val groupByWithOrderByAndLimit: String = """SELECT identifier, COUNT(identifier2) @@ -641,6 +647,58 @@ class ParserSpec extends AnyFlatSpec with Matchers { .equalsIgnoreCase(orderBy) shouldBe true } + it should "parse ORDER BY with NULLS FIRST" in { + val result = Parser(orderByNullsFirst) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(orderByNullsFirst) shouldBe true + } + + it should 
"parse ORDER BY with NULLS LAST" in { + val result = Parser(orderByNullsLast) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(orderByNullsLast) shouldBe true + } + + it should "parse ORDER BY with mixed null ordering" in { + val result = Parser(orderByMixedNulls) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(orderByMixedNulls) shouldBe true + } + + it should "parse ORDER BY with lowercase nulls first" in { + val result = Parser(orderByLowerNulls) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(orderByLowerNulls) shouldBe true + } + + it should "parse ORDER BY with NULLS ordering and no explicit direction" in { + // No ASC/DESC: the grammar still accepts NULLS LAST; the default ASC is + // injected on round-trip. + val result = Parser(orderByNullsNoDirection) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase("SELECT * FROM Table ORDER BY identifier ASC NULLS LAST") shouldBe true + } + + it should "reject NULLS FIRST / NULLS LAST on a GROUP BY / aggregation ORDER BY" in { + // ES terms aggregations have no `missing` parameter, so null ordering cannot + // be honored on a grouped/aggregated sort and must be rejected rather than + // silently dropped. + val result = Parser( + "SELECT identifier, COUNT(identifier2) FROM Table GROUP BY identifier ORDER BY identifier DESC NULLS LAST" + ) + result.isLeft shouldBe true + } + it should "parse LIMIT" in { val result = Parser(limit) result.toOption diff --git a/testkit/src/main/scala/app/softnetwork/elastic/client/SortNullsSpec.scala b/testkit/src/main/scala/app/softnetwork/elastic/client/SortNullsSpec.scala new file mode 100644 index 00000000..3d0e30e7 --- /dev/null +++ b/testkit/src/main/scala/app/softnetwork/elastic/client/SortNullsSpec.scala @@ -0,0 +1,146 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +import akka.NotUsed +import akka.actor.ActorSystem +import akka.stream.scaladsl.Source +import app.softnetwork.elastic.client.bulk._ +import app.softnetwork.elastic.client.result.{ElasticFailure, ElasticSuccess} +import app.softnetwork.elastic.client.spi.ElasticClientFactory +import app.softnetwork.elastic.scalatest.ElasticDockerTestKit +import app.softnetwork.persistence.generateUUID +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import org.slf4j.{Logger, LoggerFactory} + +import scala.language.implicitConversions + +/** Integration tests for Story 14.1: `NULLS FIRST` / `NULLS LAST` on `ORDER BY`. Verifies that the + * bridge's `missing` parameter actually controls null placement when executed against a live ES + * instance. + * + * Fixture: 4 documents in the `sort_nulls` index with `bonus` set to (100, null, 50, missing). The + * null and missing cases both translate to "no value" in ES; the `missing` sort parameter applies + * to both. 
+ */ +trait SortNullsSpec extends AnyFlatSpecLike with ElasticDockerTestKit with Matchers { + + lazy val log: Logger = LoggerFactory.getLogger(getClass.getName) + + implicit val system: ActorSystem = ActorSystem(generateUUID()) + + lazy val client: ElasticClientApi = ElasticClientFactory.create(elasticConfig) + + override def beforeAll(): Unit = { + super.beforeAll() + + val mapping = + """{ + | "properties": { + | "name": { "type": "keyword" }, + | "bonus": { "type": "integer" } + | } + |}""".stripMargin + + client.createIndex("sort_nulls", mappings = None, aliases = Nil).get shouldBe true + client.setMapping("sort_nulls", mapping).get shouldBe true + + val docs = List( + """{"id":"1","name":"alice","bonus":100}""", + """{"id":"2","name":"bob","bonus":null}""", + """{"id":"3","name":"carol","bonus":50}""", + """{"id":"4","name":"dave"}""" + ) + + implicit val bulkOptions: BulkOptions = BulkOptions( + defaultIndex = "sort_nulls", + logEvery = 5 + ) + + implicit def listToSource[T](list: List[T]): Source[T, NotUsed] = + Source.fromIterator(() => list.iterator) + + client.bulk[String](docs, identity, idKey = Some(Set("id"))) match { + case ElasticSuccess(_) => // ok + case ElasticFailure(error) => + error.cause.foreach(_.printStackTrace()) + fail(s"Bulk indexing failed: ${error.message}") + } + + client.refresh("sort_nulls") + } + + override def afterAll(): Unit = { + client.deleteIndex("sort_nulls") + super.afterAll() + } + + "ORDER BY bonus DESC NULLS LAST" should "place non-null bonuses before null bonuses" in { + val results = client.searchAs[SortNullsRow]( + "SELECT name, bonus FROM sort_nulls ORDER BY bonus DESC NULLS LAST" + ) + + results match { + case ElasticSuccess(rows) => + rows should have size 4 + val nonNullFirst = rows.takeWhile(_.bonus.isDefined) + val nullsLast = rows.dropWhile(_.bonus.isDefined) + nonNullFirst.map(_.name) shouldBe Seq("alice", "carol") + nullsLast.map(_.name) should contain theSameElementsAs Seq("bob", "dave") + log.info(s"NULLS 
LAST order: ${rows.map(_.name).mkString(", ")}") + + case ElasticFailure(error) => fail(s"Query failed: ${error.message}") + } + } + + it should "place null bonuses before non-null bonuses with NULLS FIRST" in { + val results = client.searchAs[SortNullsRow]( + "SELECT name, bonus FROM sort_nulls ORDER BY bonus DESC NULLS FIRST" + ) + + results match { + case ElasticSuccess(rows) => + rows should have size 4 + val nullsFirst = rows.takeWhile(_.bonus.isEmpty) + val nonNulls = rows.dropWhile(_.bonus.isEmpty) + nullsFirst.map(_.name) should contain theSameElementsAs Seq("bob", "dave") + nonNulls.map(_.name) shouldBe Seq("alice", "carol") + log.info(s"NULLS FIRST order: ${rows.map(_.name).mkString(", ")}") + + case ElasticFailure(error) => fail(s"Query failed: ${error.message}") + } + } + + it should "default to nulls last for ASC when ordering is unspecified" in { + val results = client.searchAs[SortNullsRow]( + "SELECT name, bonus FROM sort_nulls ORDER BY bonus ASC" + ) + + results match { + case ElasticSuccess(rows) => + rows should have size 4 + rows.take(2).map(_.name) shouldBe Seq("carol", "alice") + rows.drop(2).flatMap(_.bonus) shouldBe empty + + case ElasticFailure(error) => fail(s"Query failed: ${error.message}") + } + } + +} + +case class SortNullsRow(name: String, bonus: Option[Int] = None) From 3deb6858e64a6613529d0260dca4698866a4015c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Manciot?= Date: Fri, 5 Jun 2026 15:08:53 +0200 Subject: [PATCH 2/6] feat(sql): add GREATEST / LEAST conditional functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit N-ary GREATEST(e1, e2, …) / LEAST(e1, e2, …) with ANSI null handling: NULL args are ignored; result is NULL only when every arg is NULL. Numeric scope. Emitted as a right-folded nested-ternary Painless script field over Math.max / Math.min. - Nullability-aware guard: non-nullable args (literals) are not wrapped in `== null` (Painless rejects comparing a literal to null, e.g. `0 == null`). 
- validate() accepts numeric and unresolved (SQLAny) args, rejecting only definitively non-numeric types. - Help JSON, docs (dql_statements.md), parser/bridge/integration tests. Closed Issue #100 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../elastic/sql/SQLQuerySpec.scala | 41 +++++ .../help/functions/conditional/_index.json | 4 +- .../help/functions/conditional/greatest.json | 38 +++++ .../help/functions/conditional/least.json | 38 +++++ documentation/sql/dql_statements.md | 18 +++ .../elastic/sql/SQLQuerySpec.scala | 71 +++++++++ .../client/JestClientGreatestLeastSpec.scala | 19 +++ ...RestHighLevelClientGreatestLeastSpec.scala | 19 +++ ...RestHighLevelClientGreatestLeastSpec.scala | 19 +++ .../client/JavaClientGreatestLeastSpec.scala | 19 +++ .../client/JavaClientGreatestLeastSpec.scala | 19 +++ .../elastic/sql/function/cond/package.scala | 123 +++++++++++++++ .../elastic/sql/parser/Parser.scala | 2 + .../sql/parser/function/cond/package.scala | 14 +- .../elastic/sql/parser/ParserSpec.scala | 55 +++++++ .../elastic/client/GreatestLeastSpec.scala | 148 ++++++++++++++++++ 16 files changed, 645 insertions(+), 2 deletions(-) create mode 100644 core/src/main/resources/help/functions/conditional/greatest.json create mode 100644 core/src/main/resources/help/functions/conditional/least.json create mode 100644 es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientGreatestLeastSpec.scala create mode 100644 es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala create mode 100644 es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala create mode 100644 es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala create mode 100644 es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala create mode 100644 testkit/src/main/scala/app/softnetwork/elastic/client/GreatestLeastSpec.scala diff --git 
a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index 24467835..f71f71c6 100644 --- a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -2096,6 +2096,47 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { .replaceAll(",ZoneId.of", ", ZoneId.of") } + it should "handle GREATEST 2-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(greatest2) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"hi":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); (param1 == null ? param2 : (param2 == null ? param1 : Math.max(param1, param2)))"}}},"_source":true}""" + } + + it should "handle GREATEST 3-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(greatest3) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"hi":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); def param3 = (doc['price_uk'].size() == 0 ? null : doc['price_uk'].value); (param1 == null ? (param2 == null ? param3 : (param3 == null ? param2 : Math.max(param2, param3))) : ((param2 == null ? param3 : (param3 == null ? param2 : Math.max(param2, param3))) == null ? param1 : Math.max(param1, (param2 == null ? param3 : (param3 == null ? 
param2 : Math.max(param2, param3))))))"}}},"_source":{"includes":["sku"]}}""" + } + + it should "handle LEAST 2-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(least2) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"lo":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); (param1 == null ? param2 : (param2 == null ? param1 : Math.min(param1, param2)))"}}},"_source":true}""" + } + + it should "handle LEAST 3-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(least3) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"lo":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); def param3 = (doc['price_uk'].size() == 0 ? null : doc['price_uk'].value); (param1 == null ? (param2 == null ? param3 : (param3 == null ? param2 : Math.min(param2, param3))) : ((param2 == null ? param3 : (param3 == null ? param2 : Math.min(param2, param3))) == null ? param1 : Math.min(param1, (param2 == null ? param3 : (param3 == null ? param2 : Math.min(param2, param3))))))"}}},"_source":{"includes":["sku"]}}""" + } + + it should "not guard non-nullable (literal) GREATEST args with == null" in { + val select: ElasticSearchRequest = SelectStatement(greatestLiteral) + val query = select.query + println(query) + // Painless rejects ` == null`; a literal arg must not be guarded. 
+ query should not include "0 == null" + query should include("Math.max(0, param1)") + } + it should "handle cast function as script field" in { val select: ElasticSearchRequest = SelectStatement(conversion) diff --git a/core/src/main/resources/help/functions/conditional/_index.json b/core/src/main/resources/help/functions/conditional/_index.json index 25c8ebc4..97ffb992 100644 --- a/core/src/main/resources/help/functions/conditional/_index.json +++ b/core/src/main/resources/help/functions/conditional/_index.json @@ -3,5 +3,7 @@ "nullif.json", "coalesce.json", "isnull.json", - "isnotnull.json" + "isnotnull.json", + "greatest.json", + "least.json" ] diff --git a/core/src/main/resources/help/functions/conditional/greatest.json b/core/src/main/resources/help/functions/conditional/greatest.json new file mode 100644 index 00000000..df7287bc --- /dev/null +++ b/core/src/main/resources/help/functions/conditional/greatest.json @@ -0,0 +1,38 @@ +{ + "name": "GREATEST", + "category": "Conditional", + "shortDescription": "Return the largest non-NULL value", + "syntax": [ + "GREATEST(expression1, expression2, ...)" + ], + "description": "Returns the largest value among the given numeric expressions. 
NULL arguments are ignored; the result is NULL only if every argument is NULL.", + "parameters": [ + { + "name": "expressions", + "type": "NUMERIC", + "description": "One or more numeric expressions to compare", + "optional": false, + "defaultValue": null + } + ], + "returnType": "Numeric (widest input type)", + "examples": [ + { + "title": "Highest of multiple prices", + "description": "Pick the largest price across markets", + "sql": "SELECT GREATEST(price_us, price_eu, price_uk) AS max_price FROM products" + }, + { + "title": "Floor on a computed value", + "description": "Never let the discount go below zero", + "sql": "SELECT GREATEST(0, base_price - rebate) AS net FROM orders" + } + ], + "notes": [ + "Ignores NULL arguments (ANSI semantics)", + "Returns NULL only if every argument is NULL", + "All arguments should be numeric and have comparable types" + ], + "seeAlso": ["LEAST", "COALESCE"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/conditional/least.json b/core/src/main/resources/help/functions/conditional/least.json new file mode 100644 index 00000000..79d1531e --- /dev/null +++ b/core/src/main/resources/help/functions/conditional/least.json @@ -0,0 +1,38 @@ +{ + "name": "LEAST", + "category": "Conditional", + "shortDescription": "Return the smallest non-NULL value", + "syntax": [ + "LEAST(expression1, expression2, ...)" + ], + "description": "Returns the smallest value among the given numeric expressions. 
NULL arguments are ignored; the result is NULL only if every argument is NULL.", + "parameters": [ + { + "name": "expressions", + "type": "NUMERIC", + "description": "One or more numeric expressions to compare", + "optional": false, + "defaultValue": null + } + ], + "returnType": "Numeric (widest input type)", + "examples": [ + { + "title": "Lowest of multiple prices", + "description": "Pick the smallest price across markets", + "sql": "SELECT LEAST(price_us, price_eu, price_uk) AS min_price FROM products" + }, + { + "title": "Cap on a computed value", + "description": "Never let the rebate exceed the base price", + "sql": "SELECT LEAST(rebate, base_price) AS applied_rebate FROM orders" + } + ], + "notes": [ + "Ignores NULL arguments (ANSI semantics)", + "Returns NULL only if every argument is NULL", + "All arguments should be numeric and have comparable types" + ], + "seeAlso": ["GREATEST", "COALESCE"], + "aliases": [] +} diff --git a/documentation/sql/dql_statements.md b/documentation/sql/dql_statements.md index cb9ce846..fbc5eb99 100644 --- a/documentation/sql/dql_statements.md +++ b/documentation/sql/dql_statements.md @@ -664,6 +664,24 @@ NULLIF(a, b) Returns NULL if `a = b`, otherwise `a`. +##### GREATEST / LEAST + +```sql +GREATEST(e1, e2, ...) +LEAST(e1, e2, ...) +``` + +`GREATEST` returns the largest non-null numeric value among the given expressions; `LEAST` +returns the smallest. NULL arguments are ignored (ANSI semantics); the result is NULL only +when every argument is NULL. Both are emitted as Painless ternary chains over +`Math.max` / `Math.min`. They are row-level conditional functions, not aggregates — +`GREATEST(...) OVER (...)` is not supported. 
+ +```sql +SELECT GREATEST(price_us, price_eu, price_uk) AS max_price FROM products; +SELECT LEAST(rebate, base_price) AS applied_rebate FROM orders; +``` + **Example:** ```sql diff --git a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index 7f0fbee3..7d79c69c 100644 --- a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -2096,6 +2096,38 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { .replaceAll(",ZoneId.of", ", ZoneId.of") } + it should "handle GREATEST 2-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(greatest2) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"hi":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); (param1 == null ? param2 : (param2 == null ? param1 : Math.max(param1, param2)))"}}},"_source":true}""" + } + + it should "handle GREATEST 3-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(greatest3) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"hi":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); def param3 = (doc['price_uk'].size() == 0 ? null : doc['price_uk'].value); (param1 == null ? (param2 == null ? param3 : (param3 == null ? param2 : Math.max(param2, param3))) : ((param2 == null ? param3 : (param3 == null ? param2 : Math.max(param2, param3))) == null ? param1 : Math.max(param1, (param2 == null ? param3 : (param3 == null ? 
param2 : Math.max(param2, param3))))))"}}},"_source":{"includes":["sku"]}}""" + } + + it should "handle LEAST 2-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(least2) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"lo":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); (param1 == null ? param2 : (param2 == null ? param1 : Math.min(param1, param2)))"}}},"_source":true}""" + } + + it should "handle LEAST 3-arg as script field" in { + val select: ElasticSearchRequest = SelectStatement(least3) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"script_fields":{"lo":{"script":{"lang":"painless","source":"def param1 = (doc['price_us'].size() == 0 ? null : doc['price_us'].value); def param2 = (doc['price_eu'].size() == 0 ? null : doc['price_eu'].value); def param3 = (doc['price_uk'].size() == 0 ? null : doc['price_uk'].value); (param1 == null ? (param2 == null ? param3 : (param3 == null ? param2 : Math.min(param2, param3))) : ((param2 == null ? param3 : (param3 == null ? param2 : Math.min(param2, param3))) == null ? param1 : Math.min(param1, (param2 == null ? param3 : (param3 == null ? param2 : Math.min(param2, param3))))))"}}},"_source":{"includes":["sku"]}}""" + } + it should "handle cast function as script field" in { val select: ElasticSearchRequest = SelectStatement(conversion) @@ -4265,4 +4297,43 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { } } + + // === Story 14.1: NULLS FIRST / NULLS LAST on ORDER BY === + + it should "emit sort with missing=_last for ORDER BY ... 
DESC NULLS LAST" in { + val select: ElasticSearchRequest = + SelectStatement("SELECT identifier FROM Table ORDER BY identifier DESC NULLS LAST") + val query = select.query + query should include("\"missing\":\"_last\"") + query should include("\"order\":\"desc\"") + } + + it should "emit sort with missing=_first for ORDER BY ... ASC NULLS FIRST" in { + val select: ElasticSearchRequest = + SelectStatement("SELECT identifier FROM Table ORDER BY identifier ASC NULLS FIRST") + val query = select.query + query should include("\"missing\":\"_first\"") + query should include("\"order\":\"asc\"") + } + + it should "emit per-field null ordering for multi-column ORDER BY" in { + val select: ElasticSearchRequest = + SelectStatement( + "SELECT identifier FROM Table ORDER BY a DESC NULLS LAST, b ASC NULLS FIRST" + ) + val query = select.query + // Bind each missing value to its own sort field so a swapped mapping + // (e.g. NullsFirst -> _last) cannot pass: the missing key must live inside + // the same JSON object as the field it belongs to. + query should include regex """"a":\{[^}]*"missing":"_last"""" + query should include regex """"b":\{[^}]*"missing":"_first"""" + } + + it should "omit the missing field when no null ordering specified" in { + val select: ElasticSearchRequest = + SelectStatement("SELECT identifier FROM Table ORDER BY identifier DESC") + val query = select.query + query should not include "\"missing\"" + } + } diff --git a/es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientGreatestLeastSpec.scala b/es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientGreatestLeastSpec.scala new file mode 100644 index 00000000..7d01fad6 --- /dev/null +++ b/es6/jest/src/test/scala/app/softnetwork/elastic/client/JestClientGreatestLeastSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +class JestClientGreatestLeastSpec extends GreatestLeastSpec diff --git a/es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala b/es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala new file mode 100644 index 00000000..76d6d4c0 --- /dev/null +++ b/es6/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package app.softnetwork.elastic.client + +class RestHighLevelClientGreatestLeastSpec extends GreatestLeastSpec diff --git a/es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala b/es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala new file mode 100644 index 00000000..76d6d4c0 --- /dev/null +++ b/es7/rest/src/test/scala/app/softnetwork/elastic/client/RestHighLevelClientGreatestLeastSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +class RestHighLevelClientGreatestLeastSpec extends GreatestLeastSpec diff --git a/es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala b/es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala new file mode 100644 index 00000000..cb40c362 --- /dev/null +++ b/es8/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +class JavaClientGreatestLeastSpec extends GreatestLeastSpec diff --git a/es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala b/es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala new file mode 100644 index 00000000..cb40c362 --- /dev/null +++ b/es9/java/src/test/scala/app/softnetwork/elastic/client/JavaClientGreatestLeastSpec.scala @@ -0,0 +1,19 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package app.softnetwork.elastic.client + +class JavaClientGreatestLeastSpec extends GreatestLeastSpec diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/function/cond/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/function/cond/package.scala index f4df7954..5a4382e3 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/function/cond/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/function/cond/package.scala @@ -29,6 +29,7 @@ import app.softnetwork.elastic.sql.{ import app.softnetwork.elastic.sql.`type`.{ SQLAny, SQLBool, + SQLNumeric, SQLTemporal, SQLType, SQLTypeUtils, @@ -47,6 +48,8 @@ package object cond { case object IsNull extends Expr("ISNULL") with ConditionalOp case object IsNotNull extends Expr("ISNOTNULL") with ConditionalOp case object NullIf extends Expr("NULLIF") with ConditionalOp + case object Greatest extends Expr("GREATEST") with ConditionalOp + case object Least extends Expr("LEAST") with ConditionalOp // case object Exists extends Expr("EXISTS") with ConditionalOp case object Case extends Expr("CASE") with ConditionalOp @@ -418,4 +421,124 @@ package object cond { ) } } + + /** N-ary numeric reducer shared by `GREATEST` / `LEAST`. + * + * Emits a right-folded nested ternary so a NULL argument is skipped and the whole expression + * yields NULL only when every argument is NULL: pairwise(x, y) = (x == null ? y : (y == null ? x + * : Math.{max|min}(x, y))) + * + * Output is `SQLNumeric` because `Math.max` / `Math.min` only operate on numeric primitives. The + * base type narrows to the widest input numeric. 
+ */ + sealed trait NumericReducer + extends TransformFunction[SQLAny, SQLNumeric] + with FunctionWithIdentifier { + def values: List[PainlessScript] + def operator: ConditionalOp + protected def mathFn: String // "Math.max" | "Math.min" + + override def fun: Option[ConditionalOp] = Some(operator) + + override def args: List[PainlessScript] = values + + override def outputType: SQLNumeric = SQLTypes.Numeric + + override def identifier: Identifier = Identifier() + + override def inputType: SQLAny = SQLTypes.Any + + override def baseType: SQLType = SQLTypeUtils.leastCommonSuperType(argTypes) match { + case n: SQLNumeric => n + case _ => outputType + } + + override def sql: String = s"$operator(${values.map(_.sql).mkString(", ")})" + + override def checkIfNullable: Boolean = false + + override def validate(): Either[String, Unit] = + if (values.isEmpty) Left(s"$operator requires at least one argument") + else + // Accept numeric args and still-unresolved args (SQLAny / NULL, which + // extends SQLAny): a bare field has no known type until it is resolved + // against the index mapping, so we must not reject it here. Only reject + // args whose type is *definitively* non-numeric (string, temporal, + // boolean, …) — those would emit Math.{max,min}() on a non-numeric and + // fail at ES runtime. + values.find { v => + v.out match { + case _: SQLNumeric => false + case _: SQLAny => false + case _ => true + } + } match { + case Some(nonNumeric) => + Left( + s"$operator requires numeric arguments but got ${nonNumeric.out} for ${nonNumeric.sql}" + ) + case None => Right(()) + } + + override def nullable: Boolean = values.forall(_.nullable) + + override def toPainlessCall( + callArgs: List[String], + context: Option[PainlessContext] + ): String = { + // Pair each rendered arg with its nullability so non-nullable args (e.g. + // numeric literals, which render inline as primitives) are NOT guarded + // with `== null` — Painless rejects `<primitive> == null` at compile time. 
+ // Right-fold pairwise: pairwise(a, pairwise(b, pairwise(c, …))). + // The combined sub-tree is itself nullable only when BOTH sides can be + // null, which lets us drop the guard on parent levels too. + def fold(args: List[(String, Boolean)]): (String, Boolean) = + args match { + case Nil => + throw new IllegalArgumentException(s"$operator requires at least one argument") + case (s, n) :: Nil => (s.trim, n) + case (s, xNullable) :: rest => + val x = s.trim + val (y, yNullable) = fold(rest) + val expr = + (xNullable, yNullable) match { + case (true, true) => s"($x == null ? $y : ($y == null ? $x : $mathFn($x, $y)))" + case (true, false) => s"($x == null ? $y : $mathFn($x, $y))" + case (false, true) => s"($y == null ? $x : $mathFn($x, $y))" + case (false, false) => s"$mathFn($x, $y)" + } + // result is null only if every branch can be null + (expr, xNullable && yNullable) + } + + callArgs match { + case Nil => + throw new IllegalArgumentException(s"$operator requires at least one argument") + case x :: Nil => x + case _ => fold(callArgs.zip(values.map(_.nullable)))._1 + } + } + } + + case class Greatest(values: List[PainlessScript]) extends NumericReducer { + override def operator: ConditionalOp = Greatest + override protected def mathFn: String = "Math.max" + + override def update(request: query.SingleSearch): Greatest = + this.copy(values = values.map { + case u: Updateable => u.update(request).asInstanceOf[PainlessScript] + case other => other + }) + } + + case class Least(values: List[PainlessScript]) extends NumericReducer { + override def operator: ConditionalOp = Least + override protected def mathFn: String = "Math.min" + + override def update(request: query.SingleSearch): Least = + this.copy(values = values.map { + case u: Updateable => u.update(request).asInstanceOf[PainlessScript] + case other => other + }) + } } diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala 
b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala index b0c3ee84..51b4a7e8 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala @@ -1232,6 +1232,8 @@ trait Parser "nullif", "isnull", "isnotnull", + "greatest", + "least", "date_add", "date_sub", "parse_date", diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/cond/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/cond/package.scala index e96bbb6f..0d29f666 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/cond/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/cond/package.scala @@ -23,8 +23,10 @@ import app.softnetwork.elastic.sql.function.cond.{ ConditionalFunction, ELSE, END, + Greatest, IsNotNull, IsNull, + Least, NullIf, THEN, WHEN @@ -66,6 +68,16 @@ package object cond { case _ ~ _ ~ id1 ~ _ ~ id2 ~ _ => NullIf(id1, id2) } + def greatest: PackratParser[Greatest] = + Greatest.regex ~ start ~ rep1sep(valueExpr, separator) ~ end ^^ { case _ ~ _ ~ vs ~ _ => + Greatest(vs) + } + + def least: PackratParser[Least] = + Least.regex ~ start ~ rep1sep(valueExpr, separator) ~ end ^^ { case _ ~ _ ~ vs ~ _ => + Least(vs) + } + def start_case: PackratParser[StartCase.type] = Case.regex ^^ (_ => StartCase) def when_case: PackratParser[WhenCase.type] = WHEN.regex ^^ (_ => WhenCase) @@ -100,7 +112,7 @@ package object cond { } def conditional_function: PackratParser[FunctionWithIdentifier] = - is_null | is_notnull | coalesce | nullif + is_null | is_notnull | coalesce | nullif | greatest | least def conditionalFunctionWithIdentifier: PackratParser[Identifier] = conditional_function ^^ { t => diff --git a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala index d6faa351..8017faaf 100644 --- 
a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala +++ b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala @@ -207,6 +207,16 @@ object Queries { val isNotNullCriteria = "SELECT * FROM Table WHERE ISNOTNULL(identifier)" val coalesce: String = "SELECT COALESCE(createdAt - INTERVAL 35 MINUTE, CURRENT_DATE) AS c, identifier FROM Table" + val greatest2: String = + "SELECT GREATEST(price_us, price_eu) AS hi FROM products" + val greatest3: String = + "SELECT GREATEST(price_us, price_eu, price_uk) AS hi, sku FROM products" + val least2: String = + "SELECT LEAST(price_us, price_eu) AS lo FROM products" + val least3: String = + "SELECT LEAST(price_us, price_eu, price_uk) AS lo, sku FROM products" + val greatestLiteral: String = + "SELECT GREATEST(0, price_us) AS hi FROM products" val nullif: String = "SELECT COALESCE(NULLIF(createdAt, DATE_PARSE('2025-09-11', '%Y-%m-%d') - INTERVAL 2 DAY), CURRENT_DATE) AS c, identifier FROM Table" val conversion: String = @@ -897,6 +907,51 @@ class ParserSpec extends AnyFlatSpec with Matchers { .equalsIgnoreCase(nullif) shouldBe true } + it should "parse GREATEST 2-arg" in { + val result = Parser(greatest2) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(greatest2) shouldBe true + } + + it should "parse GREATEST 3-arg" in { + val result = Parser(greatest3) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(greatest3) shouldBe true + } + + it should "parse LEAST 2-arg" in { + val result = Parser(least2) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(least2) shouldBe true + } + + it should "parse LEAST 3-arg" in { + val result = Parser(least3) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(least3) shouldBe true + } + + it should "parse GREATEST with a literal arg" in { + val result = Parser(greatestLiteral) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(greatestLiteral) shouldBe true + } + + it 
should "reject GREATEST with definitively non-numeric args" in { + val result = Parser("SELECT GREATEST('a', 'b') AS hi FROM products") + result.isLeft shouldBe true + } + it should "parse conversion function" in { val result = Parser(conversion) result.toOption diff --git a/testkit/src/main/scala/app/softnetwork/elastic/client/GreatestLeastSpec.scala b/testkit/src/main/scala/app/softnetwork/elastic/client/GreatestLeastSpec.scala new file mode 100644 index 00000000..e46aa830 --- /dev/null +++ b/testkit/src/main/scala/app/softnetwork/elastic/client/GreatestLeastSpec.scala @@ -0,0 +1,148 @@ +/* + * Copyright 2025 SOFTNETWORK + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app.softnetwork.elastic.client + +import akka.NotUsed +import akka.actor.ActorSystem +import akka.stream.scaladsl.Source +import app.softnetwork.elastic.client.bulk._ +import app.softnetwork.elastic.client.result.{ElasticFailure, ElasticSuccess} +import app.softnetwork.elastic.client.spi.ElasticClientFactory +import app.softnetwork.elastic.scalatest.ElasticDockerTestKit +import app.softnetwork.persistence.generateUUID +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers +import org.slf4j.{Logger, LoggerFactory} + +import scala.language.implicitConversions + +/** Integration tests for Story 14.2: `GREATEST` / `LEAST` over numeric columns. 
Verifies the + * nested-ternary Painless script emitted by the bridge handles ANSI null-skipping semantics + * correctly when executed against a live ES instance. + * + * Fixture: 4 documents in the `greatest_least` index with `(a, b, c)` set to: + * - alpha: 10, 20, 30 — none null, none missing + * - missing_b: 10, _, 30 — `b` field missing + * - null_a: null, 20, 30 — `a` explicitly null + * - all_null: null, null, null — every value null + * + * Both null and missing translate to "no value" in ES; the ternary tree skips both. + */ +trait GreatestLeastSpec extends AnyFlatSpecLike with ElasticDockerTestKit with Matchers { + + lazy val log: Logger = LoggerFactory.getLogger(getClass.getName) + + implicit val system: ActorSystem = ActorSystem(generateUUID()) + + lazy val client: ElasticClientApi = ElasticClientFactory.create(elasticConfig) + + override def beforeAll(): Unit = { + super.beforeAll() + + val mapping = + """{ + | "properties": { + | "name": { "type": "keyword" }, + | "a": { "type": "integer" }, + | "b": { "type": "integer" }, + | "c": { "type": "integer" } + | } + |}""".stripMargin + + client.createIndex("greatest_least", mappings = None, aliases = Nil).get shouldBe true + client.setMapping("greatest_least", mapping).get shouldBe true + + val docs = List( + """{"id":"1","name":"alpha","a":10,"b":20,"c":30}""", + """{"id":"2","name":"missing_b","a":10,"c":30}""", + """{"id":"3","name":"null_a","a":null,"b":20,"c":30}""", + """{"id":"4","name":"all_null","a":null,"b":null,"c":null}""" + ) + + implicit val bulkOptions: BulkOptions = BulkOptions( + defaultIndex = "greatest_least", + logEvery = 5 + ) + + implicit def listToSource[T](list: List[T]): Source[T, NotUsed] = + Source.fromIterator(() => list.iterator) + + client.bulk[String](docs, identity, idKey = Some(Set("id"))) match { + case ElasticSuccess(_) => // ok + case ElasticFailure(error) => + error.cause.foreach(_.printStackTrace()) + fail(s"Bulk indexing failed: ${error.message}") + } + + 
client.refresh("greatest_least") + } + + override def afterAll(): Unit = { + client.deleteIndex("greatest_least") + super.afterAll() + } + + // ES script_fields wraps every value in an array (even scalars). The case class + // binds `g`/`l` as `Option[List[java.lang.Integer]]` to faithfully reflect that + // wire shape (a single-element array, or `[null]` when the Painless script + // collapses to null). The helper below normalizes that to a plain `Option[Int]` + // so the row assertions stay readable. + private def scalar(v: Option[List[java.lang.Integer]]): Option[Int] = + v.flatMap(_.headOption).flatMap(Option(_)).map(_.intValue) + + "GREATEST(a, b, c)" should "return the largest non-null value across three columns" in { + val results = client.searchAs[GreatestLeastRow]( + "SELECT name, GREATEST(a, b, c) AS g FROM greatest_least" + ) + + results match { + case ElasticSuccess(rows) => + rows should have size 4 + val byName = rows.map(r => r.name -> scalar(r.g)).toMap + byName("alpha") shouldBe Some(30) + byName("missing_b") shouldBe Some(30) + byName("null_a") shouldBe Some(30) + byName("all_null") shouldBe None + log.info(s"GREATEST result: $byName") + + case ElasticFailure(error) => fail(s"Query failed: ${error.message}") + } + } + + "LEAST(a, b, c)" should "return the smallest non-null value across three columns" in { + val results = client.searchAs[GreatestLeastRowLeast]( + "SELECT name, LEAST(a, b, c) AS l FROM greatest_least" + ) + + results match { + case ElasticSuccess(rows) => + rows should have size 4 + val byName = rows.map(r => r.name -> scalar(r.l)).toMap + byName("alpha") shouldBe Some(10) + byName("missing_b") shouldBe Some(10) + byName("null_a") shouldBe Some(20) + byName("all_null") shouldBe None + log.info(s"LEAST result: $byName") + + case ElasticFailure(error) => fail(s"Query failed: ${error.message}") + } + } + +} + +case class GreatestLeastRow(name: String, g: Option[List[java.lang.Integer]] = None) +case class GreatestLeastRowLeast(name: 
String, l: Option[List[java.lang.Integer]] = None) From 9c4e1143edeee708ec381024f1458fdac4960c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Manciot?= Date: Fri, 5 Jun 2026 19:48:38 +0200 Subject: [PATCH 3/6] feat(sql): add ROW_NUMBER / RANK / DENSE_RANK ranking window functions ANSI ranking windows over the existing top_hits enrichment path: - ROW_NUMBER / RANK / DENSE_RANK with required ORDER BY inside OVER, optional PARTITION BY - rank ordinals computed Scala-side in searchWithWindowEnrichment and injected per base-query row via (partitionKey, _id) lookup - top-N push-down via inline LIMIT N inside OVER -> top_hits.size = N (default cap = index.max_inner_result_window, 100) Review fixes: outer LIMIT no longer shrinks the window, LIMIT 0/negative guarded, dotted ORDER BY field fallback, null enrichment under SELECT alias. Closed Issue #101 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../sql/bridge/ElasticAggregation.scala | 40 +++- .../elastic/sql/SQLQuerySpec.scala | 40 ++++ .../help/functions/aggregate/_index.json | 5 +- .../help/functions/aggregate/dense_rank.json | 32 +++ .../help/functions/aggregate/rank.json | 32 +++ .../help/functions/aggregate/row_number.json | 31 +++ .../elastic/client/ElasticConversion.scala | 41 ++-- .../elastic/client/SearchApi.scala | 130 +++++++++++-- .../softnetwork/elastic/client/package.scala | 24 ++- documentation/sql/dql_statements.md | 39 ++++ .../sql/bridge/ElasticAggregation.scala | 35 +++- .../elastic/sql/SQLQuerySpec.scala | 40 ++++ .../sql/function/aggregate/package.scala | 156 ++++++++++++++- .../elastic/sql/parser/Parser.scala | 3 + .../parser/function/aggregate/package.scala | 30 ++- .../elastic/sql/parser/ParserSpec.scala | 57 ++++++ .../elastic/client/WindowFunctionSpec.scala | 183 ++++++++++++++++-- .../elastic/model/window/package.scala | 9 +- 18 files changed, 857 insertions(+), 70 deletions(-) create mode 100644 core/src/main/resources/help/functions/aggregate/dense_rank.json create mode 100644 
core/src/main/resources/help/functions/aggregate/rank.json create mode 100644 core/src/main/resources/help/functions/aggregate/row_number.json diff --git a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala index 56dc7701..ae94f756 100644 --- a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala +++ b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala @@ -213,23 +213,43 @@ object ElasticAggregation { case SUM => aggWithFieldOrScript(sumAgg, (name, s) => sumAgg(name, sourceField).script(s)) case _ => + val isRanking = th.isInstanceOf[RankingWindow] val limit = { th match { case _: LastValue | _: FirstValue => Some(1) - case _ => th.limit.map(_.limit) + // Ranking: top_hits.size driven by the AST's `limit`, + // populated by the inline `LIMIT N` inside OVER (the shipped + // top-N push-down syntax). When absent, default to ES + // `index.max_inner_result_window` (100); push the desired N + // via `LIMIT N` inside OVER for larger partitions. A + // non-positive LIMIT is meaningless for top-N, so it falls + // back to the default cap rather than emitting size:0. + case _: RankingWindow => + Some(th.limit.map(_.limit).filter(_ > 0).getOrElse(100)) + case _ => th.limit.map(_.limit) } } + // Ranking emits fetchSource = only the ORDER BY columns (used + // by the in-memory ordinal assigner to detect ties); `_id` + // comes back automatically as hit metadata. The aggregation + // window (LAST_VALUE / FIRST_VALUE / ARRAY_AGG) keeps the + // existing identifier-name-based fetchSource. 
+ val fetchSourceCols: Array[String] = + if (isRanking) { + th.orderBy.toSeq + .flatMap(_.sorts.map(_.field.name)) + .distinct + .toArray + } else { + (th.identifier.name +: th.fields + .filterNot(_.isScriptField) + .filterNot(_.sourceField == th.identifier.name) + .map(_.sourceField) + .distinct).toArray + } val topHits = topHitsAgg(aggName) - .fetchSource( - th.identifier.name +: th.fields - .filterNot(_.isScriptField) - .filterNot(_.sourceField == th.identifier.name) - .map(_.sourceField) - .distinct - .toArray, - Array.empty - ) + .fetchSource(fetchSourceCols, Array.empty) .copy( scripts = th.fields .filter(_.isScriptField) diff --git a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index f71f71c6..92408899 100644 --- a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -2096,6 +2096,46 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { .replaceAll(",ZoneId.of", ", ZoneId.of") } + it should "emit top_hits for ROW_NUMBER per-department" in { + val select: ElasticSearchRequest = SelectStatement(rowNumber) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"rn":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + + it should "emit top_hits for ROW_NUMBER without PARTITION BY" in { + val select: ElasticSearchRequest = SelectStatement(rowNumberNoPartition) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"rn":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}""" + } + + it should "emit top_hits for RANK per-department" in { + val select: 
ElasticSearchRequest = SelectStatement(rankSql) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"r":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + + it should "emit top_hits for DENSE_RANK per-department" in { + val select: ElasticSearchRequest = SelectStatement(denseRank) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"dr":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + + it should "push LIMIT N inside OVER into top_hits.size for ranking (top-N per group)" in { + val select: ElasticSearchRequest = SelectStatement(rankTopN) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"r":{"top_hits":{"size":3,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + it should "handle GREATEST 2-arg as script field" in { val select: ElasticSearchRequest = SelectStatement(greatest2) val query = select.query diff --git a/core/src/main/resources/help/functions/aggregate/_index.json b/core/src/main/resources/help/functions/aggregate/_index.json index 788dd3dc..7fe4fd51 100644 --- a/core/src/main/resources/help/functions/aggregate/_index.json +++ b/core/src/main/resources/help/functions/aggregate/_index.json @@ -6,5 +6,8 @@ "max.json", "array_agg.json", "first_value.json", - "last_value.json" + "last_value.json", + "row_number.json", + "rank.json", + "dense_rank.json" ] diff --git a/core/src/main/resources/help/functions/aggregate/dense_rank.json 
b/core/src/main/resources/help/functions/aggregate/dense_rank.json new file mode 100644 index 00000000..f4ad7b1d --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/dense_rank.json @@ -0,0 +1,32 @@ +{ + "name": "DENSE_RANK", + "category": "Aggregate", + "shortDescription": "Assign a rank within a partition; ties share a rank and the next rank does NOT skip", + "syntax": [ + "DENSE_RANK() OVER ([PARTITION BY ...] ORDER BY ...)" + ], + "description": "Ranking window function. Returns rank values (1, 2, 2, 3, ...) within each partition, ordered by the OVER ORDER BY clause. Ties share the same rank; the next rank does NOT skip — ranks remain dense (consecutive integers).", + "parameters": [], + "returnType": "BIGINT", + "examples": [ + { + "title": "Per-department dense rank by salary", + "description": "Salaries 100, 90, 90, 80 receive ranks 1, 2, 2, 3 (no gap)", + "sql": "SELECT name, salary,\n DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dr\nFROM emp" + }, + { + "title": "Top-N per group via inline LIMIT", + "description": "Add LIMIT N inside the OVER clause: N is pushed into the ES top_hits sub-aggregation so only the top-N rows per partition are ranked and returned.", + "sql": "SELECT name, salary,\n DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS dr\nFROM emp" + } + ], + "notes": [ + "ORDER BY is REQUIRED inside OVER (ANSI semantics)", + "PARTITION BY is optional — when absent, the entire result set is one partition", + "Ties share a rank; consecutive ranks remain dense (1, 2, 2, 3, ...)", + "Use RANK if you want the next rank to skip ties (1, 2, 2, 4, ...)", + "Top-N push-down: LIMIT N inside OVER sets the ES top_hits.size parameter (top-N rows per partition)" + ], + "seeAlso": ["RANK", "ROW_NUMBER"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/rank.json b/core/src/main/resources/help/functions/aggregate/rank.json new file mode 100644 index 00000000..5b0486a2 
--- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/rank.json @@ -0,0 +1,32 @@ +{ + "name": "RANK", + "category": "Aggregate", + "shortDescription": "Assign a rank within a partition; ties share a rank and the next rank skips", + "syntax": [ + "RANK() OVER ([PARTITION BY ...] ORDER BY ...)" + ], + "description": "Ranking window function. Returns rank values (1, 2, 2, 4, ...) within each partition, ordered by the OVER ORDER BY clause. Ties share the same rank; the next rank skips to account for the tie count.", + "parameters": [], + "returnType": "BIGINT", + "examples": [ + { + "title": "Per-department rank by salary", + "description": "Salaries 100, 90, 90, 80 receive ranks 1, 2, 2, 4", + "sql": "SELECT name, salary,\n RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS r\nFROM emp" + }, + { + "title": "Top-N per group via inline LIMIT", + "description": "Add LIMIT N inside the OVER clause: N is pushed into the ES top_hits sub-aggregation so only the top-N rows per partition are ranked and returned.", + "sql": "SELECT name, salary,\n RANK() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS r\nFROM emp" + } + ], + "notes": [ + "ORDER BY is REQUIRED inside OVER (ANSI semantics)", + "PARTITION BY is optional — when absent, the entire result set is one partition", + "Ties share a rank; the next rank skips (1, 2, 2, 4, 5, ...)", + "Use DENSE_RANK if you want ties to share a rank without the next rank skipping", + "Top-N push-down: LIMIT N inside OVER sets the ES top_hits.size parameter (top-N rows per partition)" + ], + "seeAlso": ["DENSE_RANK", "ROW_NUMBER"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/row_number.json b/core/src/main/resources/help/functions/aggregate/row_number.json new file mode 100644 index 00000000..b3ebd261 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/row_number.json @@ -0,0 +1,31 @@ +{ + "name": "ROW_NUMBER", + "category": "Aggregate", + 
"shortDescription": "Assign a sequential 1-based ordinal to each row within a partition", + "syntax": [ + "ROW_NUMBER() OVER ([PARTITION BY ...] ORDER BY ...)" + ], + "description": "Ranking window function. Returns sequential row numbers (1, 2, 3, ...) within each partition, ordered by the OVER ORDER BY clause. No ties are recognized — two rows with the same sort value still receive distinct ordinals.", + "parameters": [], + "returnType": "BIGINT", + "examples": [ + { + "title": "Per-department row number by salary", + "description": "1, 2, 3 ... within each department, ordered by salary DESC", + "sql": "SELECT name, salary,\n ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn\nFROM emp" + }, + { + "title": "Top-N per group via inline LIMIT", + "description": "Add LIMIT N inside the OVER clause: N is pushed into the ES top_hits sub-aggregation so only the top-N rows per partition are ranked and returned.", + "sql": "SELECT name, salary,\n ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS rn\nFROM emp" + } + ], + "notes": [ + "ORDER BY is REQUIRED inside OVER (ANSI semantics)", + "PARTITION BY is optional — when absent, the entire result set is one partition", + "No-ties semantics: distinct ordinals even when sort values are equal", + "Top-N push-down: LIMIT N inside OVER sets the ES top_hits.size parameter (top-N rows per partition)" + ], + "seeAlso": ["RANK", "DENSE_RANK"], + "aliases": [] +} diff --git a/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala b/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala index c32fa8ae..7c25705e 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala @@ -701,24 +701,35 @@ trait ElasticConversion { case _ => } + // Ranking windows need every hit's `_id` for the per-row ordinal + // lookup in 
`SearchApi.enrichDocumentWithWindowValues`. Skip the + // "single-source-field → return the scalar" shortcut so the + // per-hit map preserves both the ORDER BY columns AND the metadata. + val isRanking = agg.exists(_.ranking) val processedHits = hits.map { hit => val source = extractSource(hit, fieldAliases) if (hasMultipleValues) { - source.size match { - case 0 => null - case 1 => - // If only one field in source and multivalued, return the value directly - val value = source.head._2 - value match { - case list: List[_] => list - case map: Map[_, _] => map - case other => other - } - case _ => - // Multiple fields: return as object - val metadata = extractHitMetadata(hit) - val innerHits = extractInnerHits(hit, fieldAliases) - source ++ metadata ++ innerHits + if (isRanking) { + val metadata = extractHitMetadata(hit) + val innerHits = extractInnerHits(hit, fieldAliases) + source ++ metadata ++ innerHits + } else { + source.size match { + case 0 => null + case 1 => + // If only one field in source and multivalued, return the value directly + val value = source.head._2 + value match { + case list: List[_] => list + case map: Map[_, _] => map + case other => other + } + case _ => + // Multiple fields: return as object + val metadata = extractHitMetadata(hit) + val innerHits = extractInnerHits(hit, fieldAliases) + source ++ metadata ++ innerHits + } } } else { val metadata = extractHitMetadata(hit) diff --git a/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala b/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala index 34fb18c4..6570a163 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala @@ -23,6 +23,7 @@ import app.softnetwork.elastic.client.result.{ ElasticSuccess } import app.softnetwork.elastic.sql.PainlessContextType +import app.softnetwork.elastic.sql.function.aggregate.RankingWindow import 
app.softnetwork.elastic.sql.macros.SQLQueryMacros import app.softnetwork.elastic.sql.query.{ MultiSearch, @@ -1353,17 +1354,83 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { logger.info(s"✅ Parsed ${aggRows.size} aggregation buckets") - // Build cache: partition key -> window values + // Ranking-style windows in the original request, paired with their + // SELECT-field alias (which is the key under which the top_hits + // sub-aggregation surfaces in the parsed agg row). Ranking windows have + // an empty positional identifier, so we can't recover the alias from + // the AST alone — pull it from the field that wraps them. + val rankingWindows: Seq[(String, RankingWindow)] = + request.windowFields.flatMap { f => + f.identifier.windows.collect { case r: RankingWindow => + f.fieldAlias.map(_.alias).getOrElse(f.sourceField) -> r + } + } + val cache = aggRows.map { row => val partitionKey = extractPartitionKey(row, request) val windowValues = extractWindowValues(row, response.aggregations) - - partitionKey -> windowValues + val rankings = extractRankings(row, rankingWindows) + partitionKey -> windowValues.copy(rankings = rankings) } ElasticResult.success(WindowCache(ListMap(cache: _*))) } + /** Read each ranking window's top_hits results from the parsed aggregation row, compute ordinals + * via the window's `assignOrdinals` (per its tie rule), and return a `fieldAlias -> (rowId -> + * rank)` map per partition. + * + * Each `(name, rw)` pair carries the SELECT-field alias (the key under which the top_hits + * sub-agg surfaces in `row`) and the ranking window itself. + */ + /** Resolve an OVER ORDER BY column value from a top_hits inner-source map. The inner-source map + * keeps nested objects un-flattened, so a dotted column (e.g. `address.salary`) is not a + * top-level key. Fall back to walking the dotted path into nested maps. This is purely additive + * — a flat column hits the direct lookup and behaves exactly as before. 
+ */ + private def resolveSortKey(h: Map[String, Any], col: String): Any = + h.get(col) match { + case Some(v) => v + case None => + col + .split('.') + .foldLeft(Option[Any](h)) { + case (Some(m: Map[_, _]), part) => + m.asInstanceOf[Map[String, Any]].get(part) + case _ => None + } + .orNull + } + + private def extractRankings( + row: ListMap[String, Any], + rankingWindows: Seq[(String, RankingWindow)] + ): Map[String, Map[String, Long]] = { + if (rankingWindows.isEmpty) Map.empty + else { + rankingWindows.flatMap { case (name, rw) => + val orderByCols: Seq[String] = + rw.orderBy.toSeq.flatMap(_.sorts.map(_.field.name)) + val hits: Seq[Map[String, Any]] = row.get(name) match { + case Some(l: List[_]) => + l.collect { case m: Map[_, _] => + m.asInstanceOf[Map[String, Any]] + } + case _ => Seq.empty + } + if (hits.isEmpty) None + else { + val ordered: Seq[(String, Seq[Any])] = hits.map { h => + val rowId = h.getOrElse("_id", "").toString + val key = orderByCols.map(c => resolveSortKey(h, c)) + rowId -> key + } + Some(name -> rw.assignOrdinals(ordered).toMap) + } + }.toMap + } + } + // ======================================================================== // BASE QUERY EXECUTION // ======================================================================== @@ -1484,22 +1551,52 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { // Build partition key from document val partitionKey = extractPartitionKey(doc, request) + val rowId = doc.get("_id").map(_.toString).getOrElse("") + + val rankingAliases: Seq[String] = + request.windowFields.flatMap { f => + f.identifier.windows.collect { case _: RankingWindow => + f.fieldAlias.map(_.alias).getOrElse(f.sourceField) + } + } // Lookup window values cache.get(partitionKey) match { case Some(windowValues) => - // Merge document with window values - doc ++ windowValues.values + // Aggregation-style windows: merge the per-partition scalars. 
+ val withScalars = doc ++ windowValues.values + + // Ranking-style windows: look up the ordinal by row _id and inject + // it under the SELECT-field alias. Rows that the top_hits sub-agg + // didn't return (e.g. when the LIMIT push-down kept only top-N per + // partition) receive null. + if (rankingAliases.isEmpty) withScalars + else { + val rankEntries = rankingAliases.map { name => + val value = windowValues.rankings + .get(name) + .flatMap(_.get(rowId)) + .map(Long.box(_): Any) + .orNull + name -> value + } + withScalars ++ ListMap(rankEntries: _*) + } case None => logger.warn(s"⚠️ No window values found for partition: ${partitionKey.values}") - // Add null values for missing window functions - val nullValues = request.windowFunctions.map { wf => - wf.identifier.aliasOrName -> null + // Add null values for missing window functions. Aggregation-style + // windows key off their own alias/name; ranking windows have an empty + // positional identifier, so their null must be injected under the + // SELECT-field alias (mirrors the Some-branch). + val aggNulls = request.windowFunctions.collect { + case wf if !wf.isInstanceOf[RankingWindow] => + wf.identifier.aliasOrName -> (null: Any) } + val rankingNulls = rankingAliases.map(_ -> (null: Any)) - doc ++ ListMap(nullValues: _*) + doc ++ ListMap(aggNulls: _*) ++ ListMap(rankingNulls: _*) } } @@ -1517,9 +1614,19 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { } } - /** Window function values for a partition + /** Window function values for a partition. + * + * `values` carries the existing per-partition scalars (aggregation-style windows: + * SUM/COUNT/MIN/MAX/AVG, plus FIRST_VALUE/LAST_VALUE/ARRAY_AGG). + * + * `rankings` carries the per-row ordinals computed Scala-side from each ranking window's + * top_hits sub-aggregation: a map `selectFieldAlias → (rowId → rank)`. The base-row + * enrichment step looks up the ordinal by `doc._id` for each ranking window.
*/ - protected case class WindowValues(values: ListMap[String, Any]) + protected case class WindowValues( + values: ListMap[String, Any], + rankings: Map[String, Map[String, Long]] = Map.empty + ) /** Cache of partition key -> window values */ @@ -1527,4 +1634,5 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { def get(key: PartitionKey): Option[WindowValues] = cache.get(key) def size: Int = cache.size } + } diff --git a/core/src/main/scala/app/softnetwork/elastic/client/package.scala b/core/src/main/scala/app/softnetwork/elastic/client/package.scala index b1e1ff6b..94bfb778 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/package.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/package.scala @@ -339,7 +339,12 @@ package object client extends SerializationApi { */ object AggregationType extends Enumeration { type AggregationType = Value - val Count, Min, Max, Avg, Sum, FirstValue, LastValue, ArrayAgg = Value + val Count, Min, Max, Avg, Sum, FirstValue, LastValue, ArrayAgg, + // Ranking-style window functions. Each top_hits hit gets a per-row + // ordinal computed Scala-side by the searchWithWindowEnrichment + // pipeline (RankingKind in function.aggregate); the ordinal is then + // injected into the base-query row by (partitionKey, _id) lookup. + RowNumber, Rank, DenseRank = Value } /** Client Aggregation @@ -367,8 +372,20 @@ package object client extends SerializationApi { bucketRoot: String, auxiliary: Boolean = false ) { - def multivalued: Boolean = aggType == AggregationType.ArrayAgg + def multivalued: Boolean = + aggType == AggregationType.ArrayAgg || + // Ranking windows return a per-row stream from the underlying + // top_hits sub-aggregation; the enrichment pipeline consumes the + // list to compute ordinals (Scala-side) and look them up by _id. 
+ aggType == AggregationType.RowNumber || + aggType == AggregationType.Rank || + aggType == AggregationType.DenseRank def singleValued: Boolean = !multivalued + + def ranking: Boolean = + aggType == AggregationType.RowNumber || + aggType == AggregationType.Rank || + aggType == AggregationType.DenseRank } implicit def sqlAggregationToClientAggregation(agg: SQLAggregation): ClientAggregation = { @@ -386,6 +403,9 @@ package object client extends SerializationApi { case _: MaxAgg => AggregationType.Max case _: AvgAgg => AggregationType.Avg case _: SumAgg => AggregationType.Sum + case _: RowNumber => AggregationType.RowNumber + case _: Ranking => AggregationType.Rank + case _: DenseRank => AggregationType.DenseRank case _ => throw new IllegalArgumentException(s"Unsupported aggregation type: ${agg.aggType}") } ClientAggregation( diff --git a/documentation/sql/dql_statements.md b/documentation/sql/dql_statements.md index fbc5eb99..c05cde20 100644 --- a/documentation/sql/dql_statements.md +++ b/documentation/sql/dql_statements.md @@ -369,6 +369,9 @@ Supported window functions include: - `FIRST_VALUE(expr) OVER (...)` - `LAST_VALUE(expr) OVER (...)` - `ARRAY_AGG(expr) OVER (...)` +- `ROW_NUMBER() OVER ([PARTITION BY ...] ORDER BY ...)` +- `RANK() OVER ([PARTITION BY ...] ORDER BY ...)` +- `DENSE_RANK() OVER ([PARTITION BY ...] ORDER BY ...)` #### Basic window example @@ -383,6 +386,42 @@ FROM dql_sales ORDER BY product, ts; ``` +#### ROW_NUMBER / RANK / DENSE_RANK (ranking windows) + +`ORDER BY` is REQUIRED inside `OVER` for ranking functions (ANSI). `PARTITION BY` +is optional — when absent, the entire result set is treated as one partition. 
+ +```sql +SELECT name, salary, + ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn, + RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS r, + DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dr +FROM emp; +``` + +Tie semantics: + +- `ROW_NUMBER` — sequential within partition; no ties recognized (1, 2, 3, 4, …) +- `RANK` — ties share rank, next rank skips (1, 2, 2, 4, …) +- `DENSE_RANK` — ties share rank, next rank does NOT skip (1, 2, 2, 3, …) + +##### Top-N per group (push-down via `LIMIT` inside `OVER`) + +Inline `LIMIT N` inside the OVER clause to limit the number of rows ranked per +partition. The engine pushes `N` down to the underlying Elasticsearch +`top_hits.size` parameter so only the top-N rows per partition are +materialised: + +```sql +SELECT name, salary, + RANK() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS r +FROM emp; +``` + +Without an explicit `LIMIT`, `top_hits.size` defaults to 100 — the +Elasticsearch `index.max_inner_result_window` default. For larger partitions +supply `LIMIT N` inline and raise that index setting to at least `N`.
+ #### FIRST_VALUE / LAST_VALUE / ARRAY_AGG ```sql diff --git a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala index 8dcc0119..4174617b 100644 --- a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala +++ b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala @@ -214,23 +214,38 @@ object ElasticAggregation { case SUM => aggWithFieldOrScript(sumAgg, (name, s) => sumAgg(name, sourceField).script(s)) case _ => + val isRanking = th.isInstanceOf[RankingWindow] val limit = { th match { case _: LastValue | _: FirstValue => Some(1) - case _ => th.limit.map(_.limit) + // Ranking: top_hits.size driven by the AST's `limit`, + // populated by the inline `LIMIT N` inside OVER (the shipped + // top-N push-down syntax). When absent, default to ES + // `index.max_inner_result_window` (100); push the desired N + // via `LIMIT N` inside OVER for larger partitions. A + // non-positive LIMIT is meaningless for top-N, so it falls + // back to the default cap rather than emitting size:0. 
+ case _: RankingWindow => + Some(th.limit.map(_.limit).filter(_ > 0).getOrElse(100)) + case _ => th.limit.map(_.limit) } } + val fetchSourceCols: Array[String] = + if (isRanking) { + th.orderBy.toSeq + .flatMap(_.sorts.map(_.field.name)) + .distinct + .toArray + } else { + (th.identifier.name +: th.fields + .filterNot(_.isScriptField) + .filterNot(_.sourceField == th.identifier.name) + .map(_.sourceField) + .distinct).toArray + } val topHits = topHitsAgg(aggName) - .fetchSource( - th.identifier.name +: th.fields - .filterNot(_.isScriptField) - .filterNot(_.sourceField == th.identifier.name) - .map(_.sourceField) - .distinct - .toArray, - Array.empty - ) + .fetchSource(fetchSourceCols, Array.empty) .copy( scripts = th.fields .filter(_.isScriptField) diff --git a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index 7d79c69c..6715b010 100644 --- a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -2096,6 +2096,46 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { .replaceAll(",ZoneId.of", ", ZoneId.of") } + it should "emit top_hits for ROW_NUMBER per-department" in { + val select: ElasticSearchRequest = SelectStatement(rowNumber) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"rn":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + + it should "emit top_hits for ROW_NUMBER without PARTITION BY" in { + val select: ElasticSearchRequest = SelectStatement(rowNumberNoPartition) + val query = select.query + println(query) + query shouldBe + 
"""{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"rn":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}""" + } + + it should "emit top_hits for RANK per-department" in { + val select: ElasticSearchRequest = SelectStatement(rankSql) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"r":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + + it should "emit top_hits for DENSE_RANK per-department" in { + val select: ElasticSearchRequest = SelectStatement(denseRank) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"dr":{"top_hits":{"size":100,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + + it should "push LIMIT N inside OVER into top_hits.size for ranking (top-N per group)" in { + val select: ElasticSearchRequest = SelectStatement(rankTopN) + val query = select.query + println(query) + query shouldBe + """{"query":{"match_all":{}},"size":0,"_source":false,"aggs":{"department":{"terms":{"field":"department","min_doc_count":1},"aggs":{"r":{"top_hits":{"size":3,"sort":[{"salary":{"order":"desc"}}],"_source":{"includes":["salary"]}}}}}}}""" + } + it should "handle GREATEST 2-arg as script field" in { val select: ElasticSearchRequest = SelectStatement(greatest2) val query = select.query diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala index fcf51616..0abc9710 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala +++ 
b/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala @@ -63,6 +63,10 @@ package object aggregate { override val words: List[String] = List(sql, "ARRAY") } + case object ROW_NUMBER extends Expr("ROW_NUMBER") with Window + case object RANK extends Expr("RANK") with Window + case object DENSE_RANK extends Expr("DENSE_RANK") with Window + case object OVER extends Expr("OVER") with TokenRegex case object PARTITION_BY extends Expr("PARTITION BY") with TokenRegex @@ -124,15 +128,31 @@ package object aggregate { b.identifier.identifierName -> b }.toMap + /** Window subclasses that should emit `LIMIT N` inside their OVER clause when round-tripping to + * SQL. Defaults to false so existing windows (FIRST_VALUE / LAST_VALUE / ARRAY_AGG / + * aggregate-style) keep the bare round-trip; ranking windows override to true so the push-down + * syntax is preserved. + */ + protected def emitsLimitInOver: Boolean = false + override def sql: String = { (partitionBy, orderBy) match { case (Nil, None) => s"$window($identifier)" - case _ => - val orderByStr = orderBy.map(_.sql).getOrElse("") + case _ => + // OrderBy.sql carries a leading space — strip it when there is + // no PARTITION BY ahead so the OVER clause does not start with + // `OVER ( ORDER BY ...)`. + val orderByStr = + orderBy + .map(_.sql) + .map(s => if (partitionBy.isEmpty) s.stripPrefix(" ") else s) + .getOrElse("") val partitionByStr = if (partitionBy.nonEmpty) s"$PARTITION_BY ${partitionBy.mkString(", ")}" else "" - s"$window($identifier) $OVER ($partitionByStr$orderByStr)" + val limitStr = + if (emitsLimitInOver) limit.map(_.sql).getOrElse("") else "" + s"$window($identifier) $OVER ($partitionByStr$orderByStr$limitStr)" } } @@ -345,4 +365,134 @@ package object aggregate { identifier = identifier.update(request) ) } + + /** ROW_NUMBER / RANK / DENSE_RANK — ranking-style windows. 
+ * + * ANSI requires `ORDER BY` inside the `OVER (...)` clause for ranking functions; the parser + * enforces this so the AST always carries an `Option[OrderBy]` that is `Some(...)`. PARTITION BY + * is optional — when absent the whole result set is one partition. + * + * Distinct from the other `WindowFunction` shapes because the result is one value per ROW within + * partition, not one value per partition. The `searchWithWindowEnrichment` pipeline branches on + * this trait via pattern matching: ranking windows produce a per-row ordinal injected by lookup + * on `(partitionKey, _id)`. + */ + sealed trait RankingWindow extends WindowFunction { + override def isWindowing: Boolean = true + // Ranking windows surface their `LIMIT N` clause in the SQL round-trip + // so the top-N push-down syntax is preserved through Updateable.update. + override protected def emitsLimitInOver: Boolean = true + + /** Apply this window's tie rule to an ordered `(rowId, sortKey)` sequence. + * + * - ROW_NUMBER: sequential, no ties (1, 2, 3, 4, …) + * - RANK: ties share rank, next rank skips (1, 2, 2, 4, …) + * - DENSE_RANK: ties share rank, next rank does not skip (1, 2, 2, 3, …) + * + * Tie detection is value-equality on the full OVER ORDER BY tuple. 
+ */ + def assignOrdinals(ordered: Seq[(String, Seq[Any])]): Seq[(String, Long)] + } + + case class RowNumber( + partitionBy: Seq[Identifier] = Seq.empty, + orderBy: Option[OrderBy], + fields: Seq[Field] = Seq.empty, + limit: Option[Limit] = None + ) extends RankingWindow { + override def identifier: Identifier = Identifier() + override def window: Window = ROW_NUMBER + override def baseType: SQLType = SQLTypes.BigInt + + override def assignOrdinals(ordered: Seq[(String, Seq[Any])]): Seq[(String, Long)] = + ordered.zipWithIndex.map { case ((rowId, _), i) => rowId -> (i + 1L) } + + override def withPartitionBy(pb: Seq[Identifier]): WindowFunction = + this.copy(partitionBy = pb) + + override def withFields(fs: Seq[Field]): WindowFunction = this.copy(fields = fs) + + override def update(request: SingleSearch): WindowFunction = super + .update(request) + .asInstanceOf[RowNumber] + .copy( + orderBy = orderBy.map(_.update(request)) + // NB: ranking windows intentionally do NOT fall back to the outer + // query LIMIT — top-N push-down comes solely from the inline `LIMIT N` + // inside OVER. Standard SQL computes window functions before LIMIT, so + // the outer LIMIT must not shrink the per-partition ranked set. 
+ ) + } + + case class Ranking( + partitionBy: Seq[Identifier] = Seq.empty, + orderBy: Option[OrderBy], + fields: Seq[Field] = Seq.empty, + limit: Option[Limit] = None + ) extends RankingWindow { + override def identifier: Identifier = Identifier() + override def window: Window = RANK + override def baseType: SQLType = SQLTypes.BigInt + + override def assignOrdinals(ordered: Seq[(String, Seq[Any])]): Seq[(String, Long)] = { + var lastKey: Seq[Any] = null + var lastRank = 0L + ordered.zipWithIndex.map { case ((rowId, key), i) => + if (key != lastKey) { lastRank = (i + 1).toLong; lastKey = key } + rowId -> lastRank + } + } + + override def withPartitionBy(pb: Seq[Identifier]): WindowFunction = + this.copy(partitionBy = pb) + + override def withFields(fs: Seq[Field]): WindowFunction = this.copy(fields = fs) + + override def update(request: SingleSearch): WindowFunction = super + .update(request) + .asInstanceOf[Ranking] + .copy( + orderBy = orderBy.map(_.update(request)) + // NB: ranking windows intentionally do NOT fall back to the outer + // query LIMIT — top-N push-down comes solely from the inline `LIMIT N` + // inside OVER. Standard SQL computes window functions before LIMIT, so + // the outer LIMIT must not shrink the per-partition ranked set. 
+ ) + } + + case class DenseRank( + partitionBy: Seq[Identifier] = Seq.empty, + orderBy: Option[OrderBy], + fields: Seq[Field] = Seq.empty, + limit: Option[Limit] = None + ) extends RankingWindow { + override def identifier: Identifier = Identifier() + override def window: Window = DENSE_RANK + override def baseType: SQLType = SQLTypes.BigInt + + override def assignOrdinals(ordered: Seq[(String, Seq[Any])]): Seq[(String, Long)] = { + var lastKey: Seq[Any] = null + var dense = 0L + ordered.map { case (rowId, key) => + if (key != lastKey) { dense += 1; lastKey = key } + rowId -> dense + } + } + + override def withPartitionBy(pb: Seq[Identifier]): WindowFunction = + this.copy(partitionBy = pb) + + override def withFields(fs: Seq[Field]): WindowFunction = this.copy(fields = fs) + + override def update(request: SingleSearch): WindowFunction = super + .update(request) + .asInstanceOf[DenseRank] + .copy( + orderBy = orderBy.map(_.update(request)) + // NB: ranking windows intentionally do NOT fall back to the outer + // query LIMIT — top-N push-down comes solely from the inline `LIMIT N` + // inside OVER. Standard SQL computes window functions before LIMIT, so + // the outer LIMIT must not shrink the per-partition ranked set. 
+ ) + } } diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala index 51b4a7e8..8bba25d2 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala @@ -1234,6 +1234,9 @@ trait Parser "isnotnull", "greatest", "least", + "row_number", + "rank", + "dense_rank", "date_add", "date_sub", "parse_date", diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala index d86f5582..df9effd9 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala @@ -120,10 +120,34 @@ package object aggregate { SumAgg(top._1, top._2) } + /** OVER clause variant used by ranking windows: ORDER BY is REQUIRED (ANSI). Falling through to + * the optional-orderBy parser would let `ROW_NUMBER() OVER (PARTITION BY d)` parse and then + * break at execution; rejecting at parse time is preferable. + */ + private[this] def ranking_over: Parser[(Seq[Identifier], OrderBy, Option[Limit])] = + OVER.regex ~> start ~ partition_by.? ~ orderBy ~ limit.? 
<~ end ^^ { case _ ~ pb ~ ob ~ l => + (pb.getOrElse(Seq.empty), ob, l) + } + + def row_number: PackratParser[WindowFunction] = + ROW_NUMBER.regex ~ start ~ end ~ ranking_over ^^ { case _ ~ _ ~ _ ~ ((pb, ob, l)) => + RowNumber(partitionBy = pb, orderBy = Some(ob), limit = l) + } + + def rank: PackratParser[WindowFunction] = + RANK.regex ~ start ~ end ~ ranking_over ^^ { case _ ~ _ ~ _ ~ ((pb, ob, l)) => + Ranking(partitionBy = pb, orderBy = Some(ob), limit = l) + } + + def dense_rank: PackratParser[WindowFunction] = + DENSE_RANK.regex ~ start ~ end ~ ranking_over ^^ { case _ ~ _ ~ _ ~ ((pb, ob, l)) => + DenseRank(partitionBy = pb, orderBy = Some(ob), limit = l) + } + def identifierWithWindowFunction: PackratParser[Identifier] = - (first_value | last_value | array_agg | count_agg | min_agg | max_agg | avg_agg | sum_agg) ^^ { - th => - th.identifier.withFunctions(th +: th.identifier.functions) + (first_value | last_value | array_agg | count_agg | min_agg | max_agg | avg_agg | sum_agg | + row_number | rank | dense_rank) ^^ { th => + th.identifier.withFunctions(th +: th.identifier.functions) } } diff --git a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala index 8017faaf..8816ada6 100644 --- a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala +++ b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala @@ -217,6 +217,16 @@ object Queries { "SELECT LEAST(price_us, price_eu, price_uk) AS lo, sku FROM products" val greatestLiteral: String = "SELECT GREATEST(0, price_us) AS hi FROM products" + val rowNumber: String = + "SELECT name, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn FROM emp" + val rankSql: String = + "SELECT name, RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS r FROM emp" + val denseRank: String = + "SELECT name, DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dr FROM emp" 
+ val rowNumberNoPartition: String = + "SELECT name, ROW_NUMBER() OVER (ORDER BY salary DESC) AS rn FROM emp" + val rankTopN: String = + "SELECT name, salary, RANK() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS r FROM emp" val nullif: String = "SELECT COALESCE(NULLIF(createdAt, DATE_PARSE('2025-09-11', '%Y-%m-%d') - INTERVAL 2 DAY), CURRENT_DATE) AS c, identifier FROM Table" val conversion: String = @@ -947,6 +957,53 @@ class ParserSpec extends AnyFlatSpec with Matchers { .equalsIgnoreCase(greatestLiteral) shouldBe true } + it should "parse ROW_NUMBER() window" in { + val result = Parser(rowNumber) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(rowNumber) shouldBe true + } + + it should "parse RANK() window" in { + val result = Parser(rankSql) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(rankSql) shouldBe true + } + + it should "parse DENSE_RANK() window" in { + val result = Parser(denseRank) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(denseRank) shouldBe true + } + + it should "parse ROW_NUMBER without PARTITION BY" in { + val result = Parser(rowNumberNoPartition) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(rowNumberNoPartition) shouldBe true + } + + it should "parse RANK with top-N LIMIT inside OVER (push-down)" in { + val result = Parser(rankTopN) + result.toOption + .map(_.sql) + .getOrElse("") + .equalsIgnoreCase(rankTopN) shouldBe true + } + + it should "reject ROW_NUMBER() without ORDER BY (ANSI: ORDER BY REQUIRED)" in { + val result = Parser( + "SELECT ROW_NUMBER() OVER (PARTITION BY department) AS rn FROM emp" + ) + result.isLeft shouldBe true + } + it should "reject GREATEST with definitively non-numeric args" in { val result = Parser("SELECT GREATEST('a', 'b') AS hi FROM products") result.isLeft shouldBe true diff --git a/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala 
b/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala index 16f1f1ad..9ce27f65 100644 --- a/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala +++ b/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala @@ -221,7 +221,7 @@ trait WindowFunctionSpec ROW_NUMBER() OVER ( PARTITION BY department ORDER BY salary DESC - ) AS row_number + ) AS rnum FROM emp ORDER BY department, row_number """) @@ -229,7 +229,7 @@ trait WindowFunctionSpec results match { case ElasticSuccess(employees) => employees.groupBy(_.department).foreach { case (dept, emps) => - val rowNumbers = emps.flatMap(_.row_number).sorted + val rowNumbers = emps.flatMap(_.rnum).sorted rowNumbers shouldBe (1 to emps.size).toList info(s"$dept: ${emps.size} employees numbered 1 to ${emps.size}") @@ -250,7 +250,7 @@ trait WindowFunctionSpec RANK() OVER ( PARTITION BY department ORDER BY salary DESC - ) AS rank + ) AS rk FROM emp ORDER BY department, rank """) @@ -258,7 +258,7 @@ trait WindowFunctionSpec results match { case ElasticSuccess(employees) => employees.groupBy(_.department).foreach { case (dept, emps) => - val ranks = emps.flatMap(_.rank) + val ranks = emps.flatMap(_.rk) ranks.head shouldBe 1 // Top earner always rank 1 info(s"$dept top earner: ${emps.head.name} (${emps.head.salary})") @@ -452,6 +452,163 @@ trait WindowFunctionSpec } } + // ======================================================================== + // RANKING WINDOW FUNCTIONS (Story 14.3) + // ROW_NUMBER / RANK / DENSE_RANK with PARTITION BY + ORDER BY, + // plus top-N push-down via LIMIT inside OVER. 
+ // ======================================================================== + + "ROW_NUMBER window function" should "assign unique 1-based ordinals per department by salary DESC" in { + val results = client.searchAs[EmployeeWithWindow](""" + SELECT + department, + name, + salary, + hire_date, + ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rnum + FROM emp + LIMIT 100 + """) + + results match { + case ElasticSuccess(employees) => + employees should have size 20 + // Every employee receives a positive ordinal. + employees.foreach(_.rnum shouldBe defined) + // Within each department the ordinals are exactly 1..n with no gaps + // and the row carrying ordinal 1 has the largest salary. + employees.groupBy(_.department).foreach { case (dept, emps) => + val ordinals = emps.flatMap(_.rnum).sorted + ordinals shouldBe (1L to emps.size.toLong) + val byOrdinal = emps.flatMap(e => e.rnum.map(_ -> e)).toMap + byOrdinal(1L).salary shouldBe emps.map(_.salary).max + info(s"$dept ROW_NUMBER 1 = ${byOrdinal(1L).name} (${byOrdinal(1L).salary})") + } + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + + "RANK window function" should "produce values matching ROW_NUMBER on tie-free salary DESC" in { + val results = client.searchAs[EmployeeWithWindow](""" + SELECT + department, + name, + salary, + hire_date, + RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rk + FROM emp + LIMIT 100 + """) + + results match { + case ElasticSuccess(employees) => + employees should have size 20 + // The fixture has no salary ties within any department, so RANK + // agrees with ROW_NUMBER on the (rank=1..n, no gaps) invariant. 
+ employees.groupBy(_.department).foreach { case (dept, emps) => + val ranks = emps.flatMap(_.rk).sorted + ranks shouldBe (1L to emps.size.toLong) + val byRank = emps.flatMap(e => e.rk.map(_ -> e)).toMap + byRank(1L).salary shouldBe emps.map(_.salary).max + info(s"$dept RANK 1 = ${byRank(1L).name} (${byRank(1L).salary})") + } + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + + "DENSE_RANK window function" should "produce values matching ROW_NUMBER on tie-free salary DESC" in { + val results = client.searchAs[EmployeeWithWindow](""" + SELECT + department, + name, + salary, + hire_date, + DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS drk + FROM emp + LIMIT 100 + """) + + results match { + case ElasticSuccess(employees) => + employees should have size 20 + employees.groupBy(_.department).foreach { case (dept, emps) => + val ranks = emps.flatMap(_.drk).sorted + ranks shouldBe (1L to emps.size.toLong) + val byRank = emps.flatMap(e => e.drk.map(_ -> e)).toMap + byRank(1L).salary shouldBe emps.map(_.salary).max + info(s"$dept DENSE_RANK 1 = ${byRank(1L).name} (${byRank(1L).salary})") + } + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + + "Ranking without PARTITION BY" should "rank every employee across the entire result set" in { + val results = client.searchAs[EmployeeWithWindow](""" + SELECT + department, + name, + salary, + hire_date, + ROW_NUMBER() OVER (ORDER BY salary DESC) AS rnum + FROM emp + LIMIT 100 + """) + + results match { + case ElasticSuccess(employees) => + employees should have size 20 + val ordinals = employees.flatMap(_.rnum).sorted + ordinals shouldBe (1L to 20L) + // The top earner overall is Sam Turner ($130k). 
+ val byOrdinal = employees.flatMap(e => e.rnum.map(_ -> e)).toMap + byOrdinal(1L).name shouldBe "Sam Turner" + byOrdinal(1L).salary shouldBe 130000 + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + + "RANK with LIMIT N inside OVER" should "push N into ES top_hits.size and return only top-N per partition" in { + val results = client.searchAs[EmployeeWithWindow](""" + SELECT + department, + name, + salary, + hire_date, + RANK() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS rk + FROM emp + LIMIT 100 + """) + + results match { + case ElasticSuccess(employees) => + // Only rows in the top-3 per department surface with a rank value. + // The base query still returns every row, so we filter by `rk.isDefined`. + val ranked = employees.filter(_.rk.isDefined) + ranked.groupBy(_.department).foreach { case (dept, emps) => + emps.size should be <= 3 + val ranks = emps.flatMap(_.rk).sorted + ranks shouldBe (1L to emps.size.toLong) + val top = emps.minBy(_.rk.get) + info(s"$dept top-of-3 = ${top.name} (${top.salary}) rk=${top.rk.get}") + } + // Engineering has 7 employees; the top-3 are Sam, Bob, Diana.
+ val engTop3 = + ranked.filter(_.department == "Engineering").sortBy(_.rk.get).map(_.name) + engTop3 shouldBe Seq("Sam Turner", "Bob Smith", "Diana Prince") + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + // ======================================================================== // TESTS WITH AGGREGATIONS // ======================================================================== @@ -830,7 +987,7 @@ trait WindowFunctionSpec ROW_NUMBER() OVER ( PARTITION BY department ORDER BY salary DESC - ) AS row_number + ) AS rnum FROM emp ORDER BY department, row_number """, @@ -845,11 +1002,11 @@ trait WindowFunctionSpec val employees = results.map(_._1) employees.groupBy(_.department).foreach { case (dept, emps) => - val rowNumbers = emps.flatMap(_.row_number).sorted + val rowNumbers = emps.flatMap(_.rnum).sorted rowNumbers shouldBe (1 to emps.size).toList // Top earner (row_number = 1) - val topEarner = emps.find(_.row_number.contains(1)).get + val topEarner = emps.find(_.rnum.contains(1)).get dept match { case "Engineering" => topEarner.name shouldBe "Sam Turner" @@ -877,7 +1034,7 @@ trait WindowFunctionSpec ROW_NUMBER() OVER ( PARTITION BY department ORDER BY salary DESC - ) AS row_number + ) AS rnum FROM emp ORDER BY department, row_number """, @@ -891,7 +1048,7 @@ trait WindowFunctionSpec // Filtrer les top 2 par département val top2PerDept = employees - .filter(_.row_number.exists(_ <= 2)) + .filter(_.rnum.exists(_ <= 2)) .groupBy(_.department) top2PerDept.foreach { case (dept, emps) => @@ -918,7 +1075,7 @@ trait WindowFunctionSpec RANK() OVER ( PARTITION BY department ORDER BY salary DESC - ) AS rank + ) AS rk FROM emp ORDER BY department, rank """, @@ -933,7 +1090,7 @@ trait WindowFunctionSpec val employees = results.map(_._1) employees.groupBy(_.department).foreach { case (dept, emps) => - val ranks = emps.flatMap(_.rank) + val ranks = emps.flatMap(_.rk) ranks.head shouldBe 1 // Top earner always rank 1 val topEarner = 
emps.head @@ -1166,8 +1323,8 @@ trait WindowFunctionSpec (emp, pctVsFirst, scrollId) } .filter { case (emp, pct, _) => - // Ne garder que les top earners (row_number <= 3) - emp.row_number.exists(_ <= 3) + // Ne garder que les top earners (rnum <= 3) + emp.rnum.exists(_ <= 3) } .runWith(Sink.seq) diff --git a/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala b/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala index 6ac4dac7..92d56cd8 100644 --- a/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala +++ b/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala @@ -41,8 +41,13 @@ package object window { skills: Option[List[String]] = None, first_salary: Option[Int] = None, last_salary: Option[Int] = None, - rank: Option[Int] = None, - row_number: Option[Int] = None + // Ranking-window fields. The SELECT aliases avoid the reserved keywords + // (row_number / rank / dense_rank) so the parser's identifier filter + // accepts them: rnum = ROW_NUMBER, rk = RANK, drk = DENSE_RANK. + // BIGINT in SQL → Long here so the searchAs macro accepts the binding. + rnum: Option[Long] = None, + rk: Option[Long] = None, + drk: Option[Long] = None ) case class DepartmentStats( From e755ddb774c88dd13c988d186eaa93f591a657bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Manciot?= Date: Sat, 6 Jun 2026 10:01:55 +0200 Subject: [PATCH 4/6] feat(sql): add STDDEV / VARIANCE statistical aggregate family Accept and translate the six ANSI statistical aggregates (STDDEV, STDDEV_POP, STDDEV_SAMP, VARIANCE, VAR_POP, VAR_SAMP), all mapping to Elasticsearch's extended_stats aggregation via one ExtendedStatsAgg case class parameterised by a 6-variant ExtendedStatsKind ADT. - STDDEV = STDDEV_SAMP, VARIANCE = VAR_SAMP (ANSI default = sample). - Sample variants project the _sampling keys (ES 7.7+); population variants project the un-suffixed keys (ES 6+). 
On ES < 7.7 sample variants log a warning and return null. Gated by ElasticsearchVersion.supportsStdDevVariance. - Result key carried on ClientAggregation.aggResultField, set at conversion time and projected in extractMetrics; Stats branch gated on isEmpty to avoid fallthrough. - OVER (PARTITION BY ...) supported via the aggregation window pipeline. - Help JSON (6 entries + _index), docs (dql_statements, functions_aggregate), bridge + es6/bridge SQLQuerySpec JSON validation, testkit integration test. Closed Issue #102 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../sql/bridge/ElasticAggregation.scala | 11 ++ .../elastic/sql/SQLQuerySpec.scala | 111 ++++++++++++++++++ .../help/functions/aggregate/_index.json | 6 + .../help/functions/aggregate/stddev.json | 39 ++++++ .../help/functions/aggregate/stddev_pop.json | 34 ++++++ .../help/functions/aggregate/stddev_samp.json | 34 ++++++ .../help/functions/aggregate/var_pop.json | 34 ++++++ .../help/functions/aggregate/var_samp.json | 34 ++++++ .../help/functions/aggregate/variance.json | 39 ++++++ .../elastic/client/ElasticConversion.scala | 37 +++++- .../elastic/client/ElasticsearchVersion.scala | 6 + .../softnetwork/elastic/client/package.scala | 46 +++++++- documentation/sql/dql_statements.md | 22 ++++ documentation/sql/functions_aggregate.md | 63 ++++++++++ .../sql/bridge/ElasticAggregation.scala | 11 ++ .../elastic/sql/SQLQuerySpec.scala | 111 ++++++++++++++++++ .../sql/function/aggregate/package.scala | 83 +++++++++++++ .../elastic/sql/parser/Parser.scala | 6 + .../parser/function/aggregate/package.scala | 40 ++++++- .../elastic/sql/parser/ParserSpec.scala | 52 ++++++++ .../elastic/client/WindowFunctionSpec.scala | 71 +++++++++++ .../elastic/model/window/package.scala | 14 +++ 22 files changed, 898 insertions(+), 6 deletions(-) create mode 100644 core/src/main/resources/help/functions/aggregate/stddev.json create mode 100644 core/src/main/resources/help/functions/aggregate/stddev_pop.json create mode 100644 
core/src/main/resources/help/functions/aggregate/stddev_samp.json create mode 100644 core/src/main/resources/help/functions/aggregate/var_pop.json create mode 100644 core/src/main/resources/help/functions/aggregate/var_samp.json create mode 100644 core/src/main/resources/help/functions/aggregate/variance.json diff --git a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala index ae94f756..6ad0860a 100644 --- a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala +++ b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala @@ -40,6 +40,7 @@ import com.sksamuel.elastic4s.ElasticApi.{ bucketScriptAggregation, bucketSelectorAggregation, cardinalityAgg, + extendedStatsAgg, maxAgg, minAgg, nestedAggregation, @@ -191,6 +192,11 @@ object ElasticAggregation { case MAX => aggWithFieldOrScript(maxAgg, (name, s) => maxAgg(name, sourceField).script(s)) case AVG => aggWithFieldOrScript(avgAgg, (name, s) => avgAgg(name, sourceField).script(s)) case SUM => aggWithFieldOrScript(sumAgg, (name, s) => sumAgg(name, sourceField).script(s)) + case STDDEV | STDDEV_SAMP | STDDEV_POP | VARIANCE | VAR_SAMP | VAR_POP => + aggWithFieldOrScript( + extendedStatsAgg, + (name, s) => extendedStatsAgg(name, sourceField).script(s) + ) case th: WindowFunction => th.window match { case COUNT => @@ -212,6 +218,11 @@ object ElasticAggregation { aggWithFieldOrScript(avgAgg, (name, s) => avgAgg(name, sourceField).script(s)) case SUM => aggWithFieldOrScript(sumAgg, (name, s) => sumAgg(name, sourceField).script(s)) + case STDDEV | STDDEV_SAMP | STDDEV_POP | VARIANCE | VAR_SAMP | VAR_POP => + aggWithFieldOrScript( + extendedStatsAgg, + (name, s) => extendedStatsAgg(name, sourceField).script(s) + ) case _ => val isRanking = th.isInstanceOf[RankingWindow] val limit = { diff --git 
a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index 92408899..22397d73 100644 --- a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -4312,4 +4312,115 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { query should not include "\"missing\"" } + // === Story 14.4: STDDEV / VARIANCE family — extended_stats translation === + + it should "translate STDDEV(salary) GROUP BY department to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, STDDEV(salary) AS sd + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "sd": { "extended_stats": { "field": "salary" } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate STDDEV_SAMP(salary) to extended_stats (alias of STDDEV)" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, STDDEV_SAMP(salary) AS sd + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"sd\":{") + } + + it should "translate STDDEV_POP(salary) to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, STDDEV_POP(salary) AS sdp + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"sdp\":{") + } + + it should "translate VARIANCE(salary) GROUP BY department to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + 
"""SELECT department, VARIANCE(salary) AS v + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "v": { "extended_stats": { "field": "salary" } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate VAR_SAMP(salary) to extended_stats (alias of VARIANCE)" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, VAR_SAMP(salary) AS v + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"v\":{") + } + + it should "translate VAR_POP(salary) to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, VAR_POP(salary) AS vp + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"vp\":{") + } + + it should "translate VARIANCE(salary) OVER (PARTITION BY department) to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT name, salary, VARIANCE(salary) OVER (PARTITION BY department) AS v + |FROM emp""".stripMargin + ) + val query = select.query + // PARTITION BY department => a `department` terms bucket; the windowed + // VARIANCE lives as an `extended_stats` sub-aggregation against `salary`. 
+ query should include("\"terms\":{\"field\":\"department\"") + query should include("\"extended_stats\":{\"field\":\"salary\"}") + } + } diff --git a/core/src/main/resources/help/functions/aggregate/_index.json b/core/src/main/resources/help/functions/aggregate/_index.json index 7fe4fd51..e1caf4ed 100644 --- a/core/src/main/resources/help/functions/aggregate/_index.json +++ b/core/src/main/resources/help/functions/aggregate/_index.json @@ -4,6 +4,12 @@ "avg.json", "min.json", "max.json", + "stddev.json", + "stddev_samp.json", + "stddev_pop.json", + "variance.json", + "var_samp.json", + "var_pop.json", "array_agg.json", "first_value.json", "last_value.json", diff --git a/core/src/main/resources/help/functions/aggregate/stddev.json b/core/src/main/resources/help/functions/aggregate/stddev.json new file mode 100644 index 00000000..cd0e427b --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/stddev.json @@ -0,0 +1,39 @@ +{ + "name": "STDDEV", + "category": "Aggregate", + "shortDescription": "Sample standard deviation of values", + "syntax": [ + "STDDEV(column)", + "STDDEV(column) OVER (PARTITION BY partition_expr, ...)" + ], + "description": "Computes the sample standard deviation (Bessel-corrected). 
Alias for STDDEV_SAMP — matches PostgreSQL and Snowflake defaults.", + "parameters": [ + { + "name": "column", + "type": "NUMERIC", + "description": "Numeric column", + "optional": false, + "defaultValue": null + } + ], + "returnType": "DOUBLE", + "examples": [ + { + "title": "Sample standard deviation", + "description": "Across all rows", + "sql": "SELECT STDDEV(salary) FROM emp" + }, + { + "title": "Per group", + "description": "By department", + "sql": "SELECT department, STDDEV(salary) AS sd FROM emp GROUP BY department" + } + ], + "notes": [ + "Defaults to sample (matches PostgreSQL / Snowflake; note that MySQL's STDDEV is the population variant); use STDDEV_POP for population.", + "NULL values are ignored.", + "Sample variants (STDDEV / STDDEV_SAMP / VARIANCE / VAR_SAMP) require Elasticsearch 7.7+; population variants work on ES 6+." + ], + "seeAlso": ["STDDEV_POP", "STDDEV_SAMP", "VARIANCE", "VAR_POP", "VAR_SAMP"], + "aliases": ["STDDEV_SAMP"] +} diff --git a/core/src/main/resources/help/functions/aggregate/stddev_pop.json b/core/src/main/resources/help/functions/aggregate/stddev_pop.json new file mode 100644 index 00000000..2b9ea515 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/stddev_pop.json @@ -0,0 +1,34 @@ +{ + "name": "STDDEV_POP", + "category": "Aggregate", + "shortDescription": "Population standard deviation of values", + "syntax": [ + "STDDEV_POP(column)", + "STDDEV_POP(column) OVER (PARTITION BY partition_expr, ...)" + ], + "description": "Computes the population standard deviation (divides by n).
Use when the input is the entire population, not a sample.", + "parameters": [ + { + "name": "column", + "type": "NUMERIC", + "description": "Numeric column", + "optional": false, + "defaultValue": null + } + ], + "returnType": "DOUBLE", + "examples": [ + { + "title": "Population standard deviation", + "description": "Across all rows", + "sql": "SELECT STDDEV_POP(salary) FROM emp" + } + ], + "notes": [ + "Population variant — divides by n, not n-1.", + "NULL values are ignored.", + "Works on Elasticsearch 6+." + ], + "seeAlso": ["STDDEV", "STDDEV_SAMP", "VAR_POP"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/stddev_samp.json b/core/src/main/resources/help/functions/aggregate/stddev_samp.json new file mode 100644 index 00000000..1af2de52 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/stddev_samp.json @@ -0,0 +1,34 @@ +{ + "name": "STDDEV_SAMP", + "category": "Aggregate", + "shortDescription": "Sample standard deviation of values", + "syntax": [ + "STDDEV_SAMP(column)", + "STDDEV_SAMP(column) OVER (PARTITION BY partition_expr, ...)" + ], + "description": "Computes the sample standard deviation (Bessel-corrected, divides by n-1). Equivalent to STDDEV.", + "parameters": [ + { + "name": "column", + "type": "NUMERIC", + "description": "Numeric column", + "optional": false, + "defaultValue": null + } + ], + "returnType": "DOUBLE", + "examples": [ + { + "title": "Sample standard deviation", + "description": "Equivalent to STDDEV", + "sql": "SELECT STDDEV_SAMP(salary) FROM emp" + } + ], + "notes": [ + "Identical semantics to STDDEV (ANSI default is sample).", + "NULL values are ignored.", + "Requires Elasticsearch 7.7+." 
+ ], + "seeAlso": ["STDDEV", "STDDEV_POP", "VAR_SAMP"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/var_pop.json b/core/src/main/resources/help/functions/aggregate/var_pop.json new file mode 100644 index 00000000..4eea2946 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/var_pop.json @@ -0,0 +1,34 @@ +{ + "name": "VAR_POP", + "category": "Aggregate", + "shortDescription": "Population variance of values", + "syntax": [ + "VAR_POP(column)", + "VAR_POP(column) OVER (PARTITION BY partition_expr, ...)" + ], + "description": "Computes the population variance (divides by n). Use when the input is the entire population, not a sample.", + "parameters": [ + { + "name": "column", + "type": "NUMERIC", + "description": "Numeric column", + "optional": false, + "defaultValue": null + } + ], + "returnType": "DOUBLE", + "examples": [ + { + "title": "Population variance", + "description": "Across all rows", + "sql": "SELECT VAR_POP(salary) FROM emp" + } + ], + "notes": [ + "Population variant — divides by n, not n-1.", + "NULL values are ignored.", + "Works on Elasticsearch 6+." + ], + "seeAlso": ["VARIANCE", "VAR_SAMP", "STDDEV_POP"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/var_samp.json b/core/src/main/resources/help/functions/aggregate/var_samp.json new file mode 100644 index 00000000..4ea55be0 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/var_samp.json @@ -0,0 +1,34 @@ +{ + "name": "VAR_SAMP", + "category": "Aggregate", + "shortDescription": "Sample variance of values", + "syntax": [ + "VAR_SAMP(column)", + "VAR_SAMP(column) OVER (PARTITION BY partition_expr, ...)" + ], + "description": "Computes the sample variance (Bessel-corrected, divides by n-1). 
Equivalent to VARIANCE.", + "parameters": [ + { + "name": "column", + "type": "NUMERIC", + "description": "Numeric column", + "optional": false, + "defaultValue": null + } + ], + "returnType": "DOUBLE", + "examples": [ + { + "title": "Sample variance", + "description": "Equivalent to VARIANCE", + "sql": "SELECT VAR_SAMP(salary) FROM emp" + } + ], + "notes": [ + "Identical semantics to VARIANCE (ANSI default is sample).", + "NULL values are ignored.", + "Requires Elasticsearch 7.7+." + ], + "seeAlso": ["VARIANCE", "VAR_POP", "STDDEV_SAMP"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/variance.json b/core/src/main/resources/help/functions/aggregate/variance.json new file mode 100644 index 00000000..82bfd255 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/variance.json @@ -0,0 +1,39 @@ +{ + "name": "VARIANCE", + "category": "Aggregate", + "shortDescription": "Sample variance of values", + "syntax": [ + "VARIANCE(column)", + "VARIANCE(column) OVER (PARTITION BY partition_expr, ...)" + ], + "description": "Computes the sample variance (Bessel-corrected). Alias for VAR_SAMP — matches PostgreSQL and Snowflake defaults.", + "parameters": [ + { + "name": "column", + "type": "NUMERIC", + "description": "Numeric column", + "optional": false, + "defaultValue": null + } + ], + "returnType": "DOUBLE", + "examples": [ + { + "title": "Sample variance", + "description": "Across all rows", + "sql": "SELECT VARIANCE(salary) FROM emp" + }, + { + "title": "Per group", + "description": "By department", + "sql": "SELECT department, VARIANCE(salary) AS v FROM emp GROUP BY department" + } + ], + "notes": [ + "Defaults to sample (matches PostgreSQL / Snowflake); use VAR_POP for population.", + "NULL values are ignored.", + "Sample variants (STDDEV / STDDEV_SAMP / VARIANCE / VAR_SAMP) require Elasticsearch 7.7+; population variants work on ES 6+." 
+ ], + "seeAlso": ["VAR_POP", "VAR_SAMP", "STDDEV", "STDDEV_POP", "STDDEV_SAMP"], + "aliases": ["VAR_SAMP"] +} diff --git a/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala b/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala index 7c25705e..8477d7d4 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala @@ -28,6 +28,11 @@ import scala.jdk.CollectionConverters._ trait ElasticConversion { + // Distinctly named to avoid colliding with the `logger` member that + // `ClientCompanion`'s self-type contributes where this trait is mixed in. + private val conversionLogger: org.slf4j.Logger = + org.slf4j.LoggerFactory.getLogger(getClass) + def convertTo[T](map: Map[String, Any])(implicit m: Manifest[T], formats: Formats): T = { val jValue = Extraction.decompose(map) jValue.extract[T] @@ -614,8 +619,36 @@ trait ElasticConversion { name -> numericValue } .orElse { - // Stats aggregations - if (value.has("count") && value.has("sum") && value.has("avg")) { + // Extended stats — project the SQL-requested field via + // ClientAggregation.aggResultField, set at SQLAggregation → + // ClientAggregation conversion time for STDDEV / STDDEV_POP / + // STDDEV_SAMP / VARIANCE / VAR_POP / VAR_SAMP. When the projection + // key is absent (the `_sampling` sample keys require ES 7.7+), + // logs a warning and yields None so the column appears as null — + // the Stats branch below is skipped for these aggregations (see + // its `aggResultField.isEmpty` guard) to avoid emitting a + // stats-shaped struct in place of the null. 
+ aggregations.get(name).flatMap(_.aggResultField).flatMap { resultField => + Option(value.get(resultField)).filterNot(_.isNull) match { + case Some(node) => Some(name -> node.asDouble()) + case None => + conversionLogger.warn( + s"Aggregation '$name' requested extended_stats field '$resultField' " + + "which is absent from the response (sample variants require " + + "Elasticsearch 7.7+); the column will be null." + ) + None + } + } + } + .orElse { + // Stats aggregations — skipped when this aggregation projects a + // specific extended_stats field (handled above; an absent key + // yields a null column rather than a stats-shaped struct). + if ( + aggregations.get(name).flatMap(_.aggResultField).isEmpty && + value.has("count") && value.has("sum") && value.has("avg") + ) { Some( name -> ListMap( "count" -> value.get("count").asLong(), diff --git a/core/src/main/scala/app/softnetwork/elastic/client/ElasticsearchVersion.scala b/core/src/main/scala/app/softnetwork/elastic/client/ElasticsearchVersion.scala index 27f6ccb1..b814fddd 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/ElasticsearchVersion.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/ElasticsearchVersion.scala @@ -179,4 +179,10 @@ object ElasticsearchVersion { def supportsQueryWatchers(version: String): Boolean = { isAtLeast(version, 7, 11) } + + /** Check if the sample STDDEV / VARIANCE variants (extended_stats `_sampling` keys) are supported (ES >= 7.7) + */ + def supportsStdDevVariance(version: String): Boolean = { + isAtLeast(version, 7, 7) + } } diff --git a/core/src/main/scala/app/softnetwork/elastic/client/package.scala b/core/src/main/scala/app/softnetwork/elastic/client/package.scala index 94bfb778..23c86cc7 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/package.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/package.scala @@ -344,7 +344,12 @@ package object client extends SerializationApi { // ordinal computed Scala-side by the searchWithWindowEnrichment // pipeline
(RankingKind in function.aggregate); the ordinal is then // injected into the base-query row by (partitionKey, _id) lookup. - RowNumber, Rank, DenseRank = Value + RowNumber, Rank, DenseRank, + // STDDEV / VARIANCE family — all back the same ES `extended_stats` + // aggregation; the specific result key is carried separately on + // ClientAggregation.aggResultField so extractMetrics knows which + // field to project from the response. + Stddev, StddevSamp, StddevPop, Variance, VarSamp, VarPop = Value } /** Client Aggregation @@ -370,7 +375,13 @@ package object client extends SerializationApi { windowing: Boolean, bucketPath: String, bucketRoot: String, - auxiliary: Boolean = false + auxiliary: Boolean = false, + // Response field projected from a multi-key ES aggregation (currently + // `extended_stats` — e.g. "std_deviation_sampling", "variance"). The + // un-suffixed "std_deviation"/"variance" keys are the population values + // (ES 6+); the "_sampling" keys are the sample values (ES 7.7+). + // None for plain `value`-style metrics. 
+ aggResultField: Option[String] = None ) { def multivalued: Boolean = aggType == AggregationType.ArrayAgg || @@ -395,6 +406,12 @@ package object client extends SerializationApi { case MAX => AggregationType.Max case AVG => AggregationType.Avg case SUM => AggregationType.Sum + case STDDEV => AggregationType.Stddev + case STDDEV_SAMP => AggregationType.StddevSamp + case STDDEV_POP => AggregationType.StddevPop + case VARIANCE => AggregationType.Variance + case VAR_SAMP => AggregationType.VarSamp + case VAR_POP => AggregationType.VarPop case _: FirstValue => AggregationType.FirstValue case _: LastValue => AggregationType.LastValue case _: ArrayAgg => AggregationType.ArrayAgg @@ -406,8 +423,30 @@ package object client extends SerializationApi { case _: RowNumber => AggregationType.RowNumber case _: Ranking => AggregationType.Rank case _: DenseRank => AggregationType.DenseRank + case e: ExtendedStatsAgg => + e.kind match { + case ExtendedStatsKind.Stddev => AggregationType.Stddev + case ExtendedStatsKind.StddevSamp => AggregationType.StddevSamp + case ExtendedStatsKind.StddevPop => AggregationType.StddevPop + case ExtendedStatsKind.Variance => AggregationType.Variance + case ExtendedStatsKind.VarSamp => AggregationType.VarSamp + case ExtendedStatsKind.VarPop => AggregationType.VarPop + } case _ => throw new IllegalArgumentException(s"Unsupported aggregation type: ${agg.aggType}") } + // `extended_stats` is multi-key — pick which one to project. Plain + // tokens (STDDEV / STDDEV_POP / …) get a fixed key matching the SQL + // semantic; the wrapped ExtendedStatsAgg carries it on the kind. 
+ val aggResultField: Option[String] = agg.aggType match { + case STDDEV => Some(ExtendedStatsKind.Stddev.resultField) + case STDDEV_SAMP => Some(ExtendedStatsKind.StddevSamp.resultField) + case STDDEV_POP => Some(ExtendedStatsKind.StddevPop.resultField) + case VARIANCE => Some(ExtendedStatsKind.Variance.resultField) + case VAR_SAMP => Some(ExtendedStatsKind.VarSamp.resultField) + case VAR_POP => Some(ExtendedStatsKind.VarPop.resultField) + case e: ExtendedStatsAgg => Some(e.kind.resultField) + case _ => None + } ClientAggregation( agg.aggName, aggType, @@ -416,7 +455,8 @@ package object client extends SerializationApi { agg.aggType.isWindowing, agg.bucketPath, agg.bucketRoot, - agg.auxiliary + agg.auxiliary, + aggResultField ) } diff --git a/documentation/sql/dql_statements.md b/documentation/sql/dql_statements.md index c05cde20..59ca4969 100644 --- a/documentation/sql/dql_statements.md +++ b/documentation/sql/dql_statements.md @@ -284,6 +284,26 @@ Supported aggregate functions include: - `AVG(expr)` - `MIN(expr)` - `MAX(expr)` +- `STDDEV(expr)` / `STDDEV_SAMP(expr)` / `STDDEV_POP(expr)` +- `VARIANCE(expr)` / `VAR_SAMP(expr)` / `VAR_POP(expr)` + +`STDDEV` defaults to **sample** standard deviation (Bessel-corrected, `STDDEV ≡ STDDEV_SAMP`) and +`VARIANCE` defaults to **sample** variance (`VARIANCE ≡ VAR_SAMP`). This matches PostgreSQL and +Snowflake; users coming from MySQL should note that MySQL treats `STDDEV` / `VARIANCE` as +synonyms for the population forms (`STDDEV_POP` / `VAR_POP`). + +```sql +SELECT department, + STDDEV(salary) AS sd, + VAR_POP(salary) AS vp +FROM emp +GROUP BY department; +``` + +All six map to a single Elasticsearch `extended_stats` aggregation per call; the requested field +(`std_deviation_sampling`, `variance_sampling` for the sample variants; the un-suffixed +`std_deviation`, `variance` for the population variants) is projected from the response. Sample +variants require **Elasticsearch 7.7+**; population variants work on Elasticsearch 6+.
### GROUP BY and HAVING @@ -366,6 +386,8 @@ Supported window functions include: - `SUM(expr) OVER (...)` - `COUNT(expr) OVER (...)` +- `STDDEV(expr) OVER (PARTITION BY ...)` and its `_SAMP` / `_POP` variants +- `VARIANCE(expr) OVER (PARTITION BY ...)` and its `_SAMP` / `_POP` variants - `FIRST_VALUE(expr) OVER (...)` - `LAST_VALUE(expr) OVER (...)` - `ARRAY_AGG(expr) OVER (...)` diff --git a/documentation/sql/functions_aggregate.md b/documentation/sql/functions_aggregate.md index c23f8e07..e8babf61 100644 --- a/documentation/sql/functions_aggregate.md +++ b/documentation/sql/functions_aggregate.md @@ -18,6 +18,7 @@ This page documents aggregate functions for summarizing and analyzing data. 6. [FIRST_VALUE](#function-first_value) 7. [LAST_VALUE](#function-last_value) 8. [ARRAY_AGG](#function-array_agg) +9. [STDDEV / VARIANCE family](#function-stddev--variance-family) --- @@ -1204,6 +1205,62 @@ FROM emp; --- +## Function: STDDEV / VARIANCE family + +**Description:** +The six ANSI statistical aggregates compute the standard deviation and variance of a numeric column. All map to a single Elasticsearch `extended_stats` aggregation per call; each function projects the matching field from the response. + +`STDDEV` is an alias for `STDDEV_SAMP`, and `VARIANCE` is an alias for `VAR_SAMP` — i.e. both default to the **sample** (Bessel-corrected) form, matching PostgreSQL and Snowflake. (MySQL differs: its `STDDEV` / `VARIANCE` are synonyms for the population forms `STDDEV_POP` / `VAR_POP`.) + +**Syntax:** +```sql +STDDEV(expr) | STDDEV_SAMP(expr) | STDDEV_POP(expr) +VARIANCE(expr) | VAR_SAMP(expr) | VAR_POP(expr) + +-- windowed form +STDDEV(expr) OVER (PARTITION BY partition_expr, ...)
+``` + +**Inputs:** +- `expr` - Numeric column +- `PARTITION BY` - Optional grouping columns (windowed form) + +**Output:** +- `DOUBLE` + +**Function → Elasticsearch `extended_stats` field:** + +| SQL | ES `extended_stats` field | Min ES version | +|--------------------|----------------------------|----------------| +| `STDDEV(x)` | `std_deviation_sampling` | **7.7+** | +| `STDDEV_SAMP(x)` | `std_deviation_sampling` | **7.7+** | +| `STDDEV_POP(x)` | `std_deviation` | 6+ | +| `VARIANCE(x)` | `variance_sampling` | **7.7+** | +| `VAR_SAMP(x)` | `variance_sampling` | **7.7+** | +| `VAR_POP(x)` | `variance` | 6+ | + +**Behavior:** +- `NULL` values are ignored. +- The un-suffixed `std_deviation` / `variance` keys are the **population** values (present on Elasticsearch 6+); the `_sampling` keys are the **sample** values (introduced in Elasticsearch 7.7). Consequently the sample variants — including the default `STDDEV` / `VARIANCE` — require Elasticsearch 7.7+. On older clusters the column is returned as `null` and a warning is logged. +- Each call emits its own `extended_stats` aggregation; two stat calls over the same column emit two aggregations. 
+ +**Examples:** +```sql +-- Per-group sample standard deviation and population variance +SELECT department, + STDDEV(salary) AS sd, + VAR_POP(salary) AS vp +FROM emp +GROUP BY department; + +-- Windowed sample variance per partition +SELECT name, salary, + VARIANCE(salary) OVER (PARTITION BY department) AS v +FROM emp; +``` + +--- + ## Aggregate Functions Summary | Function | Purpose | Input | Output | NULL Handling | @@ -1218,5 +1275,11 @@ FROM emp; | `FIRST_VALUE(expr)` | First value (ordered) | Any | Same as input | Depends on ORDER | | `LAST_VALUE(expr)` | Last value (ordered) | Any | Same as input | Depends on ORDER | | `ARRAY_AGG(expr)` | Collect into array | Any | `ARRAY` | Includes NULLs | +| `STDDEV(expr)` | Sample std deviation | Numeric | `DOUBLE` | Ignores NULLs | +| `STDDEV_SAMP(expr)` | Sample std deviation | Numeric | `DOUBLE` | Ignores NULLs | +| `STDDEV_POP(expr)` | Population std dev | Numeric | `DOUBLE` | Ignores NULLs | +| `VARIANCE(expr)` | Sample variance | Numeric | `DOUBLE` | Ignores NULLs | +| `VAR_SAMP(expr)` | Sample variance | Numeric | `DOUBLE` | Ignores NULLs | +| `VAR_POP(expr)` | Population variance | Numeric | `DOUBLE` | Ignores NULLs | [Back to index](README.md) diff --git a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala index 4174617b..d49e42c1 100644 --- a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala +++ b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala @@ -40,6 +40,7 @@ import com.sksamuel.elastic4s.ElasticApi.{ bucketScriptAggregation, bucketSelectorAggregation, cardinalityAgg, + extendedStatsAgg, maxAgg, minAgg, nestedAggregation, @@ -192,6 +193,11 @@ object ElasticAggregation { case MAX => aggWithFieldOrScript(maxAgg, (name, s) => maxAgg(name, sourceField).script(s)) case AVG => aggWithFieldOrScript(avgAgg, (name, s) => 
avgAgg(name, sourceField).script(s)) case SUM => aggWithFieldOrScript(sumAgg, (name, s) => sumAgg(name, sourceField).script(s)) + case STDDEV | STDDEV_SAMP | STDDEV_POP | VARIANCE | VAR_SAMP | VAR_POP => + aggWithFieldOrScript( + extendedStatsAgg, + (name, s) => extendedStatsAgg(name, sourceField).script(s) + ) case th: WindowFunction => th.window match { case COUNT => @@ -213,6 +219,11 @@ object ElasticAggregation { aggWithFieldOrScript(avgAgg, (name, s) => avgAgg(name, sourceField).script(s)) case SUM => aggWithFieldOrScript(sumAgg, (name, s) => sumAgg(name, sourceField).script(s)) + case STDDEV | STDDEV_SAMP | STDDEV_POP | VARIANCE | VAR_SAMP | VAR_POP => + aggWithFieldOrScript( + extendedStatsAgg, + (name, s) => extendedStatsAgg(name, sourceField).script(s) + ) case _ => val isRanking = th.isInstanceOf[RankingWindow] val limit = { diff --git a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index 6715b010..c18fcc41 100644 --- a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -4376,4 +4376,115 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { query should not include "\"missing\"" } + // === Story 14.4: STDDEV / VARIANCE family — extended_stats translation === + + it should "translate STDDEV(salary) GROUP BY department to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, STDDEV(salary) AS sd + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "sd": { "extended_stats": { "field": "salary" } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should 
"translate STDDEV_SAMP(salary) to extended_stats (alias of STDDEV)" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, STDDEV_SAMP(salary) AS sd + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"sd\":{") + } + + it should "translate STDDEV_POP(salary) to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, STDDEV_POP(salary) AS sdp + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"sdp\":{") + } + + it should "translate VARIANCE(salary) GROUP BY department to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, VARIANCE(salary) AS v + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "v": { "extended_stats": { "field": "salary" } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate VAR_SAMP(salary) to extended_stats (alias of VARIANCE)" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, VAR_SAMP(salary) AS v + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"v\":{") + } + + it should "translate VAR_POP(salary) to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, VAR_POP(salary) AS vp + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query should 
+ include("\"extended_stats\":{\"field\":\"salary\"}") + query should include("\"vp\":{") + } + + it should "translate VARIANCE(salary) OVER (PARTITION BY department) to extended_stats" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT name, salary, VARIANCE(salary) OVER (PARTITION BY department) AS v + |FROM emp""".stripMargin + ) + val query = select.query + // PARTITION BY department => a `department` terms bucket; the windowed + // VARIANCE lives as an `extended_stats` sub-aggregation against `salary`. + query should include("\"terms\":{\"field\":\"department\"") + query should include("\"extended_stats\":{\"field\":\"salary\"}") + } + } diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala index 0abc9710..ec199efc 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala @@ -49,6 +49,16 @@ package object aggregate { case object SUM extends Expr("SUM") with AggregateFunction with Window + // STDDEV / VARIANCE family — all translate to the same ES `extended_stats` + // aggregation; the SQL token is preserved for round-trip and used to pick + // the response key (`std_deviation` vs `std_deviation_sampling`, etc.).
+ case object STDDEV extends Expr("STDDEV") with AggregateFunction with Window + case object STDDEV_POP extends Expr("STDDEV_POP") with AggregateFunction with Window + case object STDDEV_SAMP extends Expr("STDDEV_SAMP") with AggregateFunction with Window + case object VARIANCE extends Expr("VARIANCE") with AggregateFunction with Window + case object VAR_POP extends Expr("VAR_POP") with AggregateFunction with Window + case object VAR_SAMP extends Expr("VAR_SAMP") with AggregateFunction with Window + sealed trait Window extends TokenRegex case object FIRST_VALUE extends Expr("FIRST_VALUE") with Window { @@ -366,6 +376,79 @@ package object aggregate { ) } + /** STDDEV / VARIANCE family. All six SQL functions translate to a single ES `extended_stats` + * aggregation; the bridge emits one aggregation per call and the `kind` here drives result-field + * projection during response extraction (`std_deviation`, `variance`, etc.). + * + * `STDDEV` ≡ `STDDEV_SAMP` and `VARIANCE` ≡ `VAR_SAMP` (ANSI defaults to sample — matches + * PostgreSQL and Snowflake). The two pairs carry distinct kinds so the SQL form round-trips + * faithfully through `WindowFunction.sql`. + */ + sealed trait ExtendedStatsKind extends Product with Serializable { + def window: Window + def resultField: String + } + + object ExtendedStatsKind { + // ES `extended_stats` quirk: the un-suffixed `std_deviation` / `variance` + // fields are the POPULATION values (kept for backwards compatibility + // with the pre-7.7 response shape). The explicit sample values live + // under the `_sampling` keys, which were introduced in ES 7.7 — so SQL + // SAMP variants (default for ANSI STDDEV / VARIANCE) require ES 7.7+. + // POP variants work on ES 6+ via the un-suffixed keys. 
+ case object Stddev extends ExtendedStatsKind { + val window: Window = STDDEV + val resultField: String = "std_deviation_sampling" + } + case object StddevSamp extends ExtendedStatsKind { + val window: Window = STDDEV_SAMP + val resultField: String = "std_deviation_sampling" + } + case object StddevPop extends ExtendedStatsKind { + val window: Window = STDDEV_POP + val resultField: String = "std_deviation" + } + case object Variance extends ExtendedStatsKind { + val window: Window = VARIANCE + val resultField: String = "variance_sampling" + } + case object VarSamp extends ExtendedStatsKind { + val window: Window = VAR_SAMP + val resultField: String = "variance_sampling" + } + case object VarPop extends ExtendedStatsKind { + val window: Window = VAR_POP + val resultField: String = "variance" + } + } + + case class ExtendedStatsAgg( + identifier: Identifier, + kind: ExtendedStatsKind, + partitionBy: Seq[Identifier] = Seq.empty, + fields: Seq[Field] = Seq.empty + ) extends WindowFunction { + override def baseType: SQLType = SQLTypes.Double + + override def limit: Option[Limit] = None + + override def orderBy: Option[OrderBy] = None + + override def window: Window = kind.window + + override def withPartitionBy(partitionBy: Seq[Identifier]): WindowFunction = + this.copy(partitionBy = partitionBy) + + override def withFields(fields: Seq[Field]): WindowFunction = this.copy(fields = fields) + + override def update(request: SingleSearch): WindowFunction = super + .update(request) + .asInstanceOf[ExtendedStatsAgg] + .copy( + identifier = identifier.update(request) + ) + } + /** ROW_NUMBER / RANK / DENSE_RANK — ranking-style windows. 
* * ANSI requires `ORDER BY` inside the `OVER (...)` clause for ranking functions; the parser diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala index 8bba25d2..fd0b00cf 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala @@ -1289,6 +1289,12 @@ trait Parser "max", "avg", "sum", + "stddev", + "stddev_pop", + "stddev_samp", + "variance", + "var_pop", + "var_samp", "case", "when", "then", diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala index df9effd9..75cd8b28 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala @@ -35,7 +35,24 @@ package object aggregate { def sum: PackratParser[AggregateFunction] = SUM.regex ^^ (_ => SUM) - def aggregate_function: PackratParser[AggregateFunction] = count | min | max | avg | sum + def stddev: PackratParser[AggregateFunction] = STDDEV.regex ^^ (_ => STDDEV) + + def stddev_pop: PackratParser[AggregateFunction] = STDDEV_POP.regex ^^ (_ => STDDEV_POP) + + def stddev_samp: PackratParser[AggregateFunction] = STDDEV_SAMP.regex ^^ (_ => STDDEV_SAMP) + + def variance: PackratParser[AggregateFunction] = VARIANCE.regex ^^ (_ => VARIANCE) + + def var_pop: PackratParser[AggregateFunction] = VAR_POP.regex ^^ (_ => VAR_POP) + + def var_samp: PackratParser[AggregateFunction] = VAR_SAMP.regex ^^ (_ => VAR_SAMP) + + // Longest-prefix alternation: STDDEV_POP / STDDEV_SAMP / VAR_POP / VAR_SAMP must be tried + // before the bare STDDEV / VARIANCE so the suffixed forms are not shadowed. 
+ def aggregate_function: PackratParser[AggregateFunction] = + count | min | max | avg | sum | + stddev_pop | stddev_samp | stddev | + var_pop | var_samp | variance def aggWithFunction: PackratParser[Identifier] = identifierWithArithmeticExpression | @@ -120,6 +137,26 @@ package object aggregate { SumAgg(top._1, top._2) } + def stddev_agg: PackratParser[WindowFunction] = + (stddev_pop | stddev_samp | stddev) ~ window_function(aggWithFunction) ^^ { case fn ~ top => + val kind = fn match { + case STDDEV_POP => ExtendedStatsKind.StddevPop + case STDDEV_SAMP => ExtendedStatsKind.StddevSamp + case _ => ExtendedStatsKind.Stddev + } + ExtendedStatsAgg(top._1, kind, top._2) + } + + def variance_agg: PackratParser[WindowFunction] = + (var_pop | var_samp | variance) ~ window_function(aggWithFunction) ^^ { case fn ~ top => + val kind = fn match { + case VAR_POP => ExtendedStatsKind.VarPop + case VAR_SAMP => ExtendedStatsKind.VarSamp + case _ => ExtendedStatsKind.Variance + } + ExtendedStatsAgg(top._1, kind, top._2) + } + /** OVER clause variant used by ranking windows: ORDER BY is REQUIRED (ANSI). Falling through to * the optional-orderBy parser would let `ROW_NUMBER() OVER (PARTITION BY d)` parse and then * break at execution; rejecting at parse time is preferable. 
@@ -146,6 +183,7 @@ package object aggregate { def identifierWithWindowFunction: PackratParser[Identifier] = (first_value | last_value | array_agg | count_agg | min_agg | max_agg | avg_agg | sum_agg | + stddev_agg | variance_agg | row_number | rank | dense_rank) ^^ { th => th.identifier.withFunctions(th +: th.identifier.functions) } diff --git a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala index 8816ada6..308582e5 100644 --- a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala +++ b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala @@ -227,6 +227,21 @@ object Queries { "SELECT name, ROW_NUMBER() OVER (ORDER BY salary DESC) AS rn FROM emp" val rankTopN: String = "SELECT name, salary, RANK() OVER (PARTITION BY department ORDER BY salary DESC LIMIT 3) AS r FROM emp" + // Story 14.4 — STDDEV / VARIANCE family + val stddevSql: String = + "SELECT department, STDDEV(salary) AS sd FROM emp GROUP BY department" + val stddevSampSql: String = + "SELECT department, STDDEV_SAMP(salary) AS sd FROM emp GROUP BY department" + val stddevPopSql: String = + "SELECT department, STDDEV_POP(salary) AS sdp FROM emp GROUP BY department" + val varianceSql: String = + "SELECT department, VARIANCE(salary) AS v FROM emp GROUP BY department" + val varSampSql: String = + "SELECT department, VAR_SAMP(salary) AS v FROM emp GROUP BY department" + val varPopSql: String = + "SELECT department, VAR_POP(salary) AS vp FROM emp GROUP BY department" + val varianceOverSql: String = + "SELECT name, salary, VARIANCE(salary) OVER (PARTITION BY department) AS v FROM emp" val nullif: String = "SELECT COALESCE(NULLIF(createdAt, DATE_PARSE('2025-09-11', '%Y-%m-%d') - INTERVAL 2 DAY), CURRENT_DATE) AS c, identifier FROM Table" val conversion: String = @@ -1004,6 +1019,43 @@ class ParserSpec extends AnyFlatSpec with Matchers { result.isLeft shouldBe true } + // === Story 14.4: 
STDDEV / VARIANCE family === + + it should "parse STDDEV(expr) and round-trip the SQL token" in { + val result = Parser(stddevSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(stddevSql) shouldBe true + } + + it should "parse STDDEV_SAMP(expr) and round-trip the SQL token" in { + val result = Parser(stddevSampSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(stddevSampSql) shouldBe true + } + + it should "parse STDDEV_POP(expr) and round-trip the SQL token" in { + val result = Parser(stddevPopSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(stddevPopSql) shouldBe true + } + + it should "parse VARIANCE(expr) and round-trip the SQL token" in { + val result = Parser(varianceSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(varianceSql) shouldBe true + } + + it should "parse VAR_SAMP(expr) and round-trip the SQL token" in { + val result = Parser(varSampSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(varSampSql) shouldBe true + } + + it should "parse VAR_POP(expr) and round-trip the SQL token" in { + val result = Parser(varPopSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(varPopSql) shouldBe true + } + + it should "parse VARIANCE OVER (PARTITION BY ...) 
as a windowed aggregate" in { + val result = Parser(varianceOverSql) + result.toOption.map(_.sql).getOrElse("").equalsIgnoreCase(varianceOverSql) shouldBe true + } + it should "reject GREATEST with definitively non-numeric args" in { val result = Parser("SELECT GREATEST('a', 'b') AS hi FROM products") result.isLeft shouldBe true diff --git a/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala b/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala index 9ce27f65..51d97e51 100644 --- a/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala +++ b/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala @@ -43,6 +43,15 @@ trait WindowFunctionSpec implicit val localDateOrdering: Ordering[LocalDate] = Ordering.by(_.toEpochDay) + def supportsStdDevVariance: Boolean = { + client.asInstanceOf[VersionApi].version match { + case ElasticSuccess(v) => ElasticsearchVersion.supportsStdDevVariance(v) + case ElasticFailure(error) => + log.error(s"❌ Failed to retrieve Elasticsearch version: ${error.message}") + false + } + } + override def beforeAll(): Unit = { super.beforeAll() @@ -1962,4 +1971,66 @@ trait WindowFunctionSpec } } + // ======================================================================== + // STORY 14.4 — STDDEV / VARIANCE family (extended_stats) + // ======================================================================== + + "STDDEV / VARIANCE family" should "compute sample stddev, sample variance and pop variance per department" in { + assume( + supportsStdDevVariance, + "Sample STDDEV / VARIANCE require the extended_stats `_sampling` keys (Elasticsearch 7.7+)" + ) + + val results = client.searchAs[DepartmentStatsExtended]( + """ + SELECT + department, + STDDEV(salary) AS sd_salary, + VAR_POP(salary) AS vp_salary, + VAR_SAMP(salary) AS vs_salary + FROM emp + GROUP BY department + """ + ) + + results match { + case ElasticSuccess(rows) => + rows should not be empty 
+ + // Engineering fixture salaries (see EmployeeData): + // 95k, 120k, 85k, 110k, 75k, 105k, 130k. + // mean = 102857.14 + // sum of squared deviations ≈ 2,242,857,142.86 + // sample variance ≈ 373,809,523.81 (÷ n-1 = 6) + // pop variance ≈ 320,408,163.27 (÷ n = 7) + // sample stddev ≈ 19,334.16 (sqrt of sample variance) + val eng = rows.find(_.department == "Engineering").getOrElse(fail("no Engineering row")) + + eng.sd_salary should not be empty + eng.vs_salary should not be empty + eng.vp_salary should not be empty + + eng.sd_salary.get shouldBe 19334.16 +- 1.0 + eng.vs_salary.get shouldBe 3.738e8 +- 1e6 + eng.vp_salary.get shouldBe 3.204e8 +- 1e6 + + // Sanity check across all departments: sample variance ≥ population variance + // (for n ≥ 2, Bessel correction never gives a smaller value; it is equal only when all values are identical). + rows.foreach { r => + (r.vs_salary, r.vp_salary) match { + case (Some(vs), Some(vp)) => + vs should be >= vp + log.info( + f"${r.department}%-12s sd=${r.sd_salary.getOrElse(0.0)}%10.2f vp=$vp%14.2f vs=$vs%14.2f" + ) + case _ => + log.info(s"${r.department}: missing values (sample-only ES?)") + } + } + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + } diff --git a/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala b/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala index 92d56cd8..5110caca 100644 --- a/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala +++ b/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala @@ -58,6 +58,20 @@ package object window { employee_count: Long ) + /** Story 14.4 — STDDEV / VARIANCE family integration shape. + * + * All three statistical projections come from a single ES `extended_stats` per call.
Fields are + * `Option[Double]` because the sample variants return `null` on ES < 7.7 (the `_sampling` keys + * do not exist there); the population variants read the un-suffixed `std_deviation` / `variance` + * keys, which are present on ES 6+. + */ + case class DepartmentStatsExtended( + department: String, + sd_salary: Option[Double] = None, + vp_salary: Option[Double] = None, + vs_salary: Option[Double] = None + ) + case class DepartmentWithWindow( department: String, location: Option[String] = None, From 2592bc17e3bb2e26804aad8362eb3fd38a2b4f7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Manciot?= Date: Sun, 7 Jun 2026 09:00:11 +0200 Subject: [PATCH 5/6] feat(sql): add PERCENTILE_CONT / PERCENTILE_DISC percentile aggregates Maps both ANSI ordered-set percentile aggregates to the Elasticsearch `percentiles` aggregation (TDigest). One `PercentileAgg` case class; five syntax forms (WITHIN GROUP, OVER PARTITION BY/ORDER BY, top-level GROUP BY, (col, p) shorthand, bare whole-result-set), case-insensitive, p in [0,1] enforced at compile time. Multiple percentile calls on the same value column/partition coalesce into a single ES agg (sourceAgg delegate, split back per SQL alias at extraction). Nested `values.` projection with numeric-proximity fallback for drifted fractional keys; locale-independent percent label. PERCENTILE_DISC is continuous-backed (documented). es6/bridge mirrored by hand. 
Closed Issue #103 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../sql/bridge/ElasticAggregation.scala | 12 ++ .../elastic/sql/bridge/package.scala | 48 ++++++-- .../elastic/sql/SQLQuerySpec.scala | 110 ++++++++++++++++++ .../help/functions/aggregate/_index.json | 2 + .../functions/aggregate/percentile_cont.json | 31 +++++ .../functions/aggregate/percentile_disc.json | 29 +++++ .../elastic/client/ElasticConversion.scala | 75 +++++++++--- .../elastic/client/SearchApi.scala | 33 +++++- .../softnetwork/elastic/client/package.scala | 20 +++- .../client/ElasticConversionSpec.scala | 36 ++++++ documentation/sql/dql_statements.md | 23 ++++ documentation/sql/functions_aggregate.md | 50 ++++++++ documentation/sql/keywords.md | 20 +++- .../sql/bridge/ElasticAggregation.scala | 12 ++ .../elastic/sql/bridge/package.scala | 45 +++++-- .../elastic/sql/SQLQuerySpec.scala | 105 +++++++++++++++++ .../sql/function/aggregate/package.scala | 103 ++++++++++++++++ .../elastic/sql/parser/Parser.scala | 2 + .../parser/function/aggregate/package.scala | 64 +++++++++- .../elastic/sql/parser/ParserSpec.scala | 50 ++++++++ .../elastic/client/WindowFunctionSpec.scala | 79 +++++++++++++ .../elastic/model/window/package.scala | 23 ++++ 22 files changed, 934 insertions(+), 38 deletions(-) create mode 100644 core/src/main/resources/help/functions/aggregate/percentile_cont.json create mode 100644 core/src/main/resources/help/functions/aggregate/percentile_disc.json diff --git a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala index 6ad0860a..a6a7b359 100644 --- a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala +++ b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala @@ -44,6 +44,7 @@ import com.sksamuel.elastic4s.ElasticApi.{ maxAgg, minAgg, nestedAggregation, + percentilesAgg, sumAgg, termsAgg, topHitsAgg, @@ 
-223,6 +224,17 @@ object ElasticAggregation { extendedStatsAgg, (name, s) => extendedStatsAgg(name, sourceField).script(s) ) + case PERCENTILE_CONT | PERCENTILE_DISC => + // Both map to ES `percentiles` (TDigest). One call → one percent; + // the requested value column is `sourceField` (PercentileAgg.identifier). + val pct: Seq[Double] = th match { + case p: PercentileAgg => Seq(p.percent) + case _ => Seq.empty + } + aggWithFieldOrScript( + (name, field) => percentilesAgg(name, field).percents(pct), + (name, s) => percentilesAgg(name, sourceField).percents(pct).script(s) + ) case _ => val isRanking = th.isInstanceOf[RankingWindow] val limit = { diff --git a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala index f828e25a..63fc7d0d 100644 --- a/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala +++ b/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala @@ -24,7 +24,7 @@ import app.softnetwork.elastic.sql.`type`.{ SQLVarchar } import app.softnetwork.elastic.sql.config.ElasticSqlConfig -import app.softnetwork.elastic.sql.function.aggregate.COUNT +import app.softnetwork.elastic.sql.function.aggregate.{COUNT, PercentileAgg} import app.softnetwork.elastic.sql.function.geo.{Distance, Meters} import app.softnetwork.elastic.sql.operator._ import app.softnetwork.elastic.sql.query._ @@ -39,6 +39,7 @@ import com.sksamuel.elastic4s.requests.searches.aggs.{ AbstractAggregation, FilterAggregation, NestedAggregation, + PercentilesAggregation, TermsAggregation } import com.sksamuel.elastic4s.requests.searches.queries.compound.BoolQuery @@ -455,6 +456,37 @@ package object bridge { request.orderBy.map(_.sorts).getOrElse(Seq.empty) ).minScore(request.score) + /** Merge percentile ElasticAggregations that share a value column / `cont` flag / partition into + * the FIRST of them (the owner): set the owner's ES `percentiles` `percents` to the group's + * 
sorted-distinct union and drop the delegates. `.percents` is called on the owner's existing + * `PercentilesAggregation`, preserving its field/script. Mirrors + * `SearchApi.toClientAggregations` (both call [[PercentileAgg.coalescePlan]] on the same + * SELECT-ordered items, so they pick the same owner). Only percentiles sharing the same + * partition merge, so a merged agg always distributes to one bucket. + */ + private def coalescePercentileAggs( + aggs: Seq[ElasticAggregation] + ): Seq[ElasticAggregation] = { + val items = aggs.collect { + case ea if ea.aggType.isInstanceOf[PercentileAgg] => + ea.aggName -> ea.aggType.asInstanceOf[PercentileAgg] + } + if (items.size < 2) aggs + else { + val plan = PercentileAgg.coalescePlan(items) + aggs.flatMap { ea => + if (plan.isDelegate(ea.aggName)) None + else if (plan.isOwner(ea.aggName)) + Some( + ea.copy(agg = + ea.agg.asInstanceOf[PercentilesAggregation].percents(plan.mergedPercents(ea.aggName)) + ) + ) + else Some(ea) + } + } + } + implicit def requestToSearchRequest( request: SingleSearch )(implicit @@ -463,12 +495,14 @@ package object bridge { ): SearchRequest = { import request._ - val aggregations = request.aggregates.map( - ElasticAggregation( - _, - request.having.flatMap(_.criteria), - request.sorts, - request.sqlAggregations + val aggregations = coalescePercentileAggs( + request.aggregates.map( + ElasticAggregation( + _, + request.having.flatMap(_.criteria), + request.sorts, + request.sqlAggregations + ) ) ) diff --git a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index 22397d73..169a098f 100644 --- a/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -4423,4 +4423,114 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { query should include("\"extended_stats\":{\"field\":\"salary\"}") } + // === Story 14.5: 
PERCENTILE_CONT / PERCENTILE_DISC — percentiles translation === + + it should "translate PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) GROUP BY department" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99 + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "p99": { "percentiles": { "field": "salary", "percents": [99.0] } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate the PERCENTILE_CONT(salary, 0.95) shorthand to percentiles" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, PERCENTILE_CONT(salary, 0.95) AS p95 + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "p95": { "percentiles": { "field": "salary", "percents": [95.0] } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) to percentiles" in { + val select: ElasticSearchRequest = + SelectStatement( + "SELECT PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) AS median FROM emp" + ) + select.query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "median": { "percentiles": { "field": "salary", "percents": [50.0] } } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate PERCENTILE_CONT(0.9) OVER (PARTITION BY department ORDER BY salary)" in { + val select: ElasticSearchRequest = + SelectStatement( + 
"""SELECT name, PERCENTILE_CONT(0.9) OVER (PARTITION BY department ORDER BY salary) AS p90 + |FROM emp""".stripMargin + ) + val query = select.query + // PARTITION BY department => a `department` terms bucket; the windowed + // percentile lives as a `percentiles` sub-aggregation against `salary`. + query should include("\"terms\":{\"field\":\"department\"") + query should include("\"percentiles\":{\"field\":\"salary\",\"percents\":[90.0]}") + } + + it should "coalesce multiple PERCENTILE_CONT on the same column into one percentiles agg" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, + | PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS p50, + | PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary) AS p95, + | PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99 + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + // The three calls collapse to ONE `percentiles` agg (owned by p50) with the + // merged percents; p95 / p99 read from p50's `values` at extraction time and + // are returned under their own aliases. 
+ query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "p50": { "percentiles": { "field": "salary", "percents": [50.0, 95.0, 99.0] } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + "\"percentiles\"".r.findAllIn(query).length shouldBe 1 + } + } diff --git a/core/src/main/resources/help/functions/aggregate/_index.json b/core/src/main/resources/help/functions/aggregate/_index.json index e1caf4ed..f8819f20 100644 --- a/core/src/main/resources/help/functions/aggregate/_index.json +++ b/core/src/main/resources/help/functions/aggregate/_index.json @@ -10,6 +10,8 @@ "variance.json", "var_samp.json", "var_pop.json", + "percentile_cont.json", + "percentile_disc.json", "array_agg.json", "first_value.json", "last_value.json", diff --git a/core/src/main/resources/help/functions/aggregate/percentile_cont.json b/core/src/main/resources/help/functions/aggregate/percentile_cont.json new file mode 100644 index 00000000..86cd6488 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/percentile_cont.json @@ -0,0 +1,31 @@ +{ + "name": "PERCENTILE_CONT", + "category": "Aggregate", + "shortDescription": "Continuous (interpolated) percentile of a numeric column", + "syntax": [ + "PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY column)", + "PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY column) OVER (PARTITION BY partition_expr, ...)", + "PERCENTILE_CONT(p) OVER (PARTITION BY partition_expr, ... ORDER BY column)", + "PERCENTILE_CONT(column, p)" + ], + "description": "Computes a continuous (interpolated) percentile. Maps to the Elasticsearch `percentiles` aggregation (TDigest). p is a literal between 0 and 1, e.g. 0.99 for p99. The value column is given by the ORDER BY clause (WITHIN GROUP or OVER); grouping is given by OVER (PARTITION BY ...) or a top-level GROUP BY. 
A column-first shorthand PERCENTILE_CONT(column, p) is also accepted (many BI tools emit it).", + "parameters": [ + { "name": "p", "type": "NUMERIC", "description": "Percentile fraction in [0,1], e.g. 0.99", "optional": false, "defaultValue": null }, + { "name": "column", "type": "NUMERIC", "description": "Value column (from the ORDER BY clause, or the shorthand first argument)", "optional": false, "defaultValue": null } + ], + "returnType": "DOUBLE", + "examples": [ + { "title": "p99 latency per endpoint", "description": "SRE latency analysis", "sql": "SELECT endpoint, PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99 FROM requests GROUP BY endpoint" }, + { "title": "OVER form", "description": "Partition and value column in OVER", "sql": "SELECT name, PERCENTILE_CONT(0.99) OVER (PARTITION BY department ORDER BY salary) AS p99 FROM emp" }, + { "title": "Shorthand", "description": "BI-tool friendly", "sql": "SELECT department, PERCENTILE_CONT(salary, 0.95) AS p95 FROM emp GROUP BY department" } + ], + "notes": [ + "Maps to the Elasticsearch `percentiles` aggregation (TDigest, continuous/interpolated).", + "The value column comes from the ORDER BY clause (WITHIN GROUP or OVER), or the (column, p) shorthand's first argument.", + "Multiple percentile calls on the same column coalesce into a single Elasticsearch aggregation.", + "NULL values are ignored.", + "Works on Elasticsearch 6+." 
+ ], + "seeAlso": ["PERCENTILE_DISC", "AVG", "STDDEV"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/aggregate/percentile_disc.json b/core/src/main/resources/help/functions/aggregate/percentile_disc.json new file mode 100644 index 00000000..7799cdf7 --- /dev/null +++ b/core/src/main/resources/help/functions/aggregate/percentile_disc.json @@ -0,0 +1,29 @@ +{ + "name": "PERCENTILE_DISC", + "category": "Aggregate", + "shortDescription": "Discrete percentile (continuous-backed) of a numeric column", + "syntax": [ + "PERCENTILE_DISC(p) WITHIN GROUP (ORDER BY column)", + "PERCENTILE_DISC(p) WITHIN GROUP (ORDER BY column) OVER (PARTITION BY partition_expr, ...)", + "PERCENTILE_DISC(p) OVER (PARTITION BY partition_expr, ... ORDER BY column)", + "PERCENTILE_DISC(column, p)" + ], + "description": "Computes a discrete percentile. p is a literal between 0 and 1, e.g. 0.99 for p99. The value column is given by the ORDER BY clause (WITHIN GROUP or OVER); grouping is given by OVER (PARTITION BY ...) or a top-level GROUP BY. A column-first shorthand PERCENTILE_DISC(column, p) is also accepted.", + "parameters": [ + { "name": "p", "type": "NUMERIC", "description": "Percentile fraction in [0,1], e.g. 0.99", "optional": false, "defaultValue": null }, + { "name": "column", "type": "NUMERIC", "description": "Value column (from the ORDER BY clause, or the shorthand first argument)", "optional": false, "defaultValue": null } + ], + "returnType": "DOUBLE", + "examples": [ + { "title": "p95 per department", "description": "Discrete percentile", "sql": "SELECT department, PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY salary) AS p95 FROM emp GROUP BY department" } + ], + "notes": [ + "Maps to the Elasticsearch `percentiles` aggregation (TDigest). 
Elasticsearch has no native discrete percentile, so PERCENTILE_DISC returns the same interpolated value as PERCENTILE_CONT — it does NOT (yet) round to the nearest actual data point as ANSI PERCENTILE_DISC specifies.", + "The value column comes from the ORDER BY clause (WITHIN GROUP or OVER), or the (column, p) shorthand's first argument.", + "Multiple percentile calls on the same column coalesce into a single Elasticsearch aggregation.", + "NULL values are ignored.", + "Works on Elasticsearch 6+." + ], + "seeAlso": ["PERCENTILE_CONT", "AVG", "STDDEV"], + "aliases": [] +} diff --git a/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala b/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala index 8477d7d4..d24e3af9 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/ElasticConversion.scala @@ -588,6 +588,38 @@ trait ElasticConversion { .orElse(Try(LocalTime.parse(text, isoTimeFormatter)).toOption) } + private def parseDoubleOpt(s: String): Option[Double] = Try(s.toDouble).toOption + + /** Project a multi-key result field (set via `ClientAggregation.aggResultField`) out of an + * aggregation node. Tries the exact key top-level (extended_stats, e.g. "variance") then nested + * under the `values` object (percentiles, e.g. "99.0"). + * + * When the exact key is absent from `values`, falls back to numeric-proximity matching: + * Elasticsearch labels percentile keys from the requested percent, and a fractional percentile + * sent as a float (`0.07 → 7.000000000000001`) can be echoed under a slightly different string. + * Matching the closest `values` key within a small epsilon keeps the column populated instead of + * silently nulling it. 
+ */ + private def projectAggResultField(value: JsonNode, resultField: String): Option[Double] = { + val direct = Option(value.get(resultField)).filterNot(_.isNull) + val valuesNode = Option(value.get("values")) + val nested = valuesNode.flatMap(vs => Option(vs.get(resultField))).filterNot(_.isNull) + direct.orElse(nested).map(_.asDouble()).orElse { + (valuesNode, parseDoubleOpt(resultField)) match { + case (Some(vs: ObjectNode), Some(want)) => + var best: Option[(Double, JsonNode)] = None + vs.forEachEntry { (k, v) => + parseDoubleOpt(k).foreach { kd => + if (!v.isNull && best.forall(b => Math.abs(kd - want) < Math.abs(b._1 - want))) + best = Some(kd -> v) + } + } + best.collect { case (kd, node) if Math.abs(kd - want) <= 1e-6 => node.asDouble() } + case _ => None + } + } + } + /** Extract metrics from an aggregation node */ def extractMetrics( @@ -619,23 +651,25 @@ trait ElasticConversion { name -> numericValue } .orElse { - // Extended stats — project the SQL-requested field via + // Multi-key projection — project the SQL-requested field via // ClientAggregation.aggResultField, set at SQLAggregation → - // ClientAggregation conversion time for STDDEV / STDDEV_POP / - // STDDEV_SAMP / VARIANCE / VAR_POP / VAR_SAMP. When the projection - // key is absent (the `_sampling` sample keys require ES 7.7+), - // logs a warning and yields None so the column appears as null — - // the Stats branch below is skipped for these aggregations (see - // its `aggResultField.isEmpty` guard) to avoid emitting a - // stats-shaped struct in place of the null. + // ClientAggregation conversion time. Covers: + // * extended_stats (STDDEV / VARIANCE) — key is top-level, e.g. + // "variance"; the `_sampling` sample keys require ES 7.7+. + // * percentiles (PERCENTILE_CONT / PERCENTILE_DISC) — key is + // nested under the response `values` object, e.g. "99.0". 
+ // When the key is absent, logs a warning and yields None so the + // column appears as null — the Stats branch below is skipped for + // these aggregations (see its `aggResultField.isEmpty` guard) to + // avoid emitting a stats-shaped struct in place of the null. aggregations.get(name).flatMap(_.aggResultField).flatMap { resultField => - Option(value.get(resultField)).filterNot(_.isNull) match { - case Some(node) => Some(name -> node.asDouble()) + projectAggResultField(value, resultField) match { + case Some(d) => Some(name -> d) case None => conversionLogger.warn( - s"Aggregation '$name' requested extended_stats field '$resultField' " + - "which is absent from the response (sample variants require " + - "Elasticsearch 7.7+); the column will be null." + s"Aggregation '$name' requested result field '$resultField' " + + "which is absent from the response (extended_stats sample variants " + + "require Elasticsearch 7.7+); the column will be null." ) None } @@ -682,6 +716,21 @@ trait ElasticConversion { if (!isAuxiliary) metrics ++= Seq(m._1 -> m._2) case _ => } + // Coalesced percentile delegates: sibling columns whose `sourceAgg` + // points at THIS node read their own percentile from its `values` + // object (the owner column was projected by the aggResultField branch + // above). Set on delegates by SearchApi.toClientAggregations. + // Skip auxiliary delegates (HAVING/WHERE/ORDER BY only, not in SELECT) + // so they do not leak into the result columns — mirrors the !isAuxiliary + // guard on the main projection branch above. 
+ aggregations.foreach { case (delegateName, ca) => + if (ca.sourceAgg.contains(name) && !ca.auxiliary) { + ca.aggResultField.foreach { resultField => + projectAggResultField(value, resultField) + .foreach(d => metrics ++= Seq(delegateName -> d)) + } + } + } } ListMap((bucketRoot match { case Some(root) => metrics ++ Seq("bucket_root" -> root) diff --git a/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala b/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala index 6570a163..5deacaea 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/SearchApi.scala @@ -23,7 +23,7 @@ import app.softnetwork.elastic.client.result.{ ElasticSuccess } import app.softnetwork.elastic.sql.PainlessContextType -import app.softnetwork.elastic.sql.function.aggregate.RankingWindow +import app.softnetwork.elastic.sql.function.aggregate.{PercentileAgg, RankingWindow} import app.softnetwork.elastic.sql.macros.SQLQueryMacros import app.softnetwork.elastic.sql.query.{ MultiSearch, @@ -158,6 +158,29 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { } } + /** Convert SELECT aggregations to ClientAggregations, applying percentile coalescing: columns + * sharing a value column / `cont` flag / partition become delegates of the first (owner) column + * and read their value from the owner's shared ES `percentiles` response node. Mirrors the + * bridge's percent-merge (both call [[PercentileAgg.coalescePlan]] on the same SELECT-ordered + * items, so they always pick the same owner). 
+ */ + private def toClientAggregations( + aggregations: ListMap[String, SQLAggregation] + ): ListMap[String, ClientAggregation] = { + val aggs0 = aggregations.map(kv => kv._1 -> implicitly[ClientAggregation](kv._2)) + val percentileItems = aggregations.toSeq.collect { + case (name, sa) if sa.aggType.isInstanceOf[PercentileAgg] => + name -> sa.aggType.asInstanceOf[PercentileAgg] + } + if (percentileItems.size < 2) aggs0 + else { + val plan = PercentileAgg.coalescePlan(percentileItems) + aggs0.map { case (name, ca) => + name -> (if (plan.isDelegate(name)) ca.copy(sourceAgg = plan.ownerOf.get(name)) else ca) + } + } + } + /** Search for documents / aggregations matching the Elasticsearch query. * * @param elasticQuery @@ -202,7 +225,7 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { logger.info( s"✅ Successfully executed search for query \n$elasticQuery\nin indices '$indices'" ) - val aggs = aggregations.map(kv => kv._1 -> implicitly[ClientAggregation](kv._2)) + val aggs = toClientAggregations(aggregations) ElasticResult.fromTry( parseResponse( response, @@ -311,7 +334,7 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { logger.info( s"✅ Successfully executed multi-search for query \n$elasticQueries" ) - val aggs = aggregations.map(kv => kv._1 -> implicitly[ClientAggregation](kv._2)) + val aggs = toClientAggregations(aggregations) ElasticResult.fromTry( parseResponse( response, @@ -488,7 +511,7 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { logger.info( s"✅ Successfully executed asynchronous search for query \n$elasticQuery\nin indices '$indices'" ) - val aggs = aggregations.map(kv => kv._1 -> implicitly[ClientAggregation](kv._2)) + val aggs = toClientAggregations(aggregations) ElasticResult.fromTry( parseResponse( response, @@ -585,7 +608,7 @@ trait SearchApi extends ElasticConversion with ElasticClientHelpers { logger.info( s"✅ Successfully executed asynchronous multi-search for query 
\n$elasticQueries" ) - val aggs = aggregations.map(kv => kv._1 -> implicitly[ClientAggregation](kv._2)) + val aggs = toClientAggregations(aggregations) ElasticResult.fromTry( parseResponse( response, diff --git a/core/src/main/scala/app/softnetwork/elastic/client/package.scala b/core/src/main/scala/app/softnetwork/elastic/client/package.scala index 23c86cc7..c820f039 100644 --- a/core/src/main/scala/app/softnetwork/elastic/client/package.scala +++ b/core/src/main/scala/app/softnetwork/elastic/client/package.scala @@ -349,7 +349,11 @@ package object client extends SerializationApi { // aggregation; the specific result key is carried separately on // ClientAggregation.aggResultField so extractMetrics knows which // field to project from the response. - Stddev, StddevSamp, StddevPop, Variance, VarSamp, VarPop = Value + Stddev, StddevSamp, StddevPop, Variance, VarSamp, VarPop, + // PERCENTILE_CONT / PERCENTILE_DISC — both back the ES `percentiles` + // aggregation; the requested percentile key (e.g. "99.0") is carried on + // ClientAggregation.aggResultField and projected from the response `values`. + PercentileCont, PercentileDisc = Value } /** Client Aggregation @@ -381,7 +385,12 @@ package object client extends SerializationApi { // un-suffixed "std_deviation"/"variance" keys are the population values // (ES 6+); the "_sampling" keys are the sample values (ES 7.7+). // None for plain `value`-style metrics. - aggResultField: Option[String] = None + aggResultField: Option[String] = None, + // When several percentile columns coalesce into one ES `percentiles` + // aggregation, the delegates name the shared response node here (the owner + // column's aggName). extractMetrics reads `values[aggResultField]` from that + // node for the delegate column. None ⇒ this column reads its own node. 
+ sourceAgg: Option[String] = None ) { def multivalued: Boolean = aggType == AggregationType.ArrayAgg || @@ -432,6 +441,8 @@ package object client extends SerializationApi { case ExtendedStatsKind.VarSamp => AggregationType.VarSamp case ExtendedStatsKind.VarPop => AggregationType.VarPop } + case p: PercentileAgg => + if (p.cont) AggregationType.PercentileCont else AggregationType.PercentileDisc case _ => throw new IllegalArgumentException(s"Unsupported aggregation type: ${agg.aggType}") } // `extended_stats` is multi-key — pick which one to project. Plain @@ -445,7 +456,10 @@ package object client extends SerializationApi { case VAR_SAMP => Some(ExtendedStatsKind.VarSamp.resultField) case VAR_POP => Some(ExtendedStatsKind.VarPop.resultField) case e: ExtendedStatsAgg => Some(e.kind.resultField) - case _ => None + // `percentiles` is multi-key — project the requested percentile (e.g. "99.0") + // from the response `values` object (see extractMetrics). + case p: PercentileAgg => Some(p.resultField) + case _ => None } ClientAggregation( agg.aggName, diff --git a/core/src/test/scala/app/softnetwork/elastic/client/ElasticConversionSpec.scala b/core/src/test/scala/app/softnetwork/elastic/client/ElasticConversionSpec.scala index 016302bc..165a65ed 100644 --- a/core/src/test/scala/app/softnetwork/elastic/client/ElasticConversionSpec.scala +++ b/core/src/test/scala/app/softnetwork/elastic/client/ElasticConversionSpec.scala @@ -703,6 +703,42 @@ class ElasticConversionSpec extends AnyFlatSpec with Matchers with ElasticConver } } + it should "project a percentile column via numeric-proximity when ES drifts the values key" in { + // Story 14.5 review: p=0.333 → requested percent 33.300000000000004 (IEEE-754 + // noise). ES echoes the key as "33.3", so the exact-key lookup misses; the + // numeric-proximity fallback must still populate the column. 
+ val results = + """{ + | "took": 2, + | "timed_out": false, + | "hits": { "total": { "value": 5, "relation": "eq" }, "hits": [] }, + | "aggregations": { + | "p33": { "values": { "33.3": 4200.0 } } + | } + |}""".stripMargin + + val aggregations = ListMap( + "p33" -> ClientAggregation( + aggName = "p33", + aggType = AggregationType.PercentileCont, + distinct = false, + sourceField = "salary", + windowing = false, + bucketPath = "", + bucketRoot = "", + aggResultField = Some("33.300000000000004") + ) + ) + + parseResponse(results, ListMap.empty, aggregations) match { + case Success(rows) => + rows.size shouldBe 1 + rows.head("p33") shouldBe 4200.0 + case Failure(error) => + throw error + } + } + it should "parse window results with distinct partitions" in { val results = """ diff --git a/documentation/sql/dql_statements.md b/documentation/sql/dql_statements.md index 59ca4969..7e20d21b 100644 --- a/documentation/sql/dql_statements.md +++ b/documentation/sql/dql_statements.md @@ -305,6 +305,29 @@ All six map to a single Elasticsearch `extended_stats` aggregation per call; the `std_deviation`, `variance` for the population variants) is projected from the response. Sample variants require **Elasticsearch 7.7+**; population variants work on Elasticsearch 6+. +### Percentiles — `PERCENTILE_CONT` / `PERCENTILE_DISC` + +- `PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY column)` — ANSI ordered-set aggregate (optionally with a top-level `GROUP BY`) +- `PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY column) OVER (PARTITION BY ...)` — value column from `WITHIN GROUP`, partition from `OVER` +- `PERCENTILE_CONT(p) OVER (PARTITION BY ... ORDER BY column)` — value column from the `OVER` `ORDER BY` +- `PERCENTILE_CONT(column, p)` — column-first shorthand (many BI tools emit it) + +The percentile literal `p` is a value in `[0, 1]` (e.g. `0.99` for p99); a value outside that range is +rejected at parse time. 
The **value column** is given by the `ORDER BY` clause (`WITHIN GROUP` or `OVER`), +or the shorthand's first argument; **grouping** is given by `OVER (PARTITION BY ...)` or a top-level +`GROUP BY` (or neither — a single percentile over the whole result set). Both functions map to the +Elasticsearch `percentiles` aggregation (TDigest). Elasticsearch has no native discrete percentile, so +`PERCENTILE_DISC` is **continuous-backed** — it returns the same interpolated value as `PERCENTILE_CONT` +rather than the nearest actual data point. All forms work on Elasticsearch 6+. + +```sql +-- p99 request latency per endpoint (SRE latency analysis) +SELECT endpoint, + PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99 +FROM requests +GROUP BY endpoint; +``` + ### GROUP BY and HAVING ```sql diff --git a/documentation/sql/functions_aggregate.md b/documentation/sql/functions_aggregate.md index e8babf61..414940be 100644 --- a/documentation/sql/functions_aggregate.md +++ b/documentation/sql/functions_aggregate.md @@ -19,6 +19,7 @@ This page documents aggregate functions for summarizing and analyzing data. 7. [LAST_VALUE](#function-last_value) 8. [ARRAY_AGG](#function-array_agg) 9. [STDDEV / VARIANCE family](#function-stddev--variance-family) +10. [PERCENTILE_CONT / PERCENTILE_DISC](#function-percentile_cont--percentile_disc) --- @@ -1261,6 +1262,53 @@ FROM emp; --- +## Function: PERCENTILE_CONT / PERCENTILE_DISC + +**Description:** +Compute a percentile of a numeric column. `PERCENTILE_CONT` is continuous (interpolated); `PERCENTILE_DISC` is the discrete form. Both map to the Elasticsearch `percentiles` aggregation (TDigest). Elasticsearch has no native discrete percentile, so `PERCENTILE_DISC` is **continuous-backed** — it returns the same interpolated value as `PERCENTILE_CONT` rather than the nearest actual data point. 
+ +**Syntax:** +```sql +PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY column) +PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY column) OVER (PARTITION BY partition_expr, ...) +PERCENTILE_CONT(p) OVER (PARTITION BY partition_expr, ... ORDER BY column) +PERCENTILE_CONT(column, p) +``` + +**Inputs:** +- `p` - Percentile fraction, a literal in `[0, 1]` (e.g. `0.99` for p99). Out-of-range values are rejected at parse time. +- `column` - Numeric value column. Supplied by the `ORDER BY` clause (`WITHIN GROUP` or `OVER`), or the shorthand's first argument. +- Grouping comes from `OVER (PARTITION BY ...)` or a top-level `GROUP BY` (or neither — one value over the whole result set). + +**Output:** +- `DOUBLE` + +**Behavior:** +- `NULL` values are ignored. +- Multiple percentile calls on the same value column may be coalesced into a single Elasticsearch aggregation. +- Works on Elasticsearch 6+. + +**Examples:** +```sql +-- p99 request latency per endpoint (SRE latency analysis) +SELECT endpoint, + PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY duration_ms) AS p99 +FROM requests +GROUP BY endpoint; + +-- OVER form: partition and value column both in OVER +SELECT name, + PERCENTILE_CONT(0.95) OVER (PARTITION BY department ORDER BY salary) AS p95 +FROM emp; + +-- Column-first shorthand (BI-tool friendly) +SELECT department, PERCENTILE_CONT(salary, 0.5) AS median +FROM emp +GROUP BY department; +``` + +--- + ## Aggregate Functions Summary | Function | Purpose | Input | Output | NULL Handling | @@ -1281,5 +1329,7 @@ FROM emp; | `VARIANCE(expr)` | Sample variance | Numeric | `DOUBLE` | Ignores NULLs | | `VAR_SAMP(expr)` | Sample variance | Numeric | `DOUBLE` | Ignores NULLs | | `VAR_POP(expr)` | Population variance | Numeric | `DOUBLE` | Ignores NULLs | +| `PERCENTILE_CONT(p)` | Continuous percentile | Numeric | `DOUBLE` | Ignores NULLs | +| `PERCENTILE_DISC(p)` | Discrete percentile | Numeric | `DOUBLE` | Ignores NULLs | [Back to index](README.md) diff --git a/documentation/sql/keywords.md 
b/documentation/sql/keywords.md index 7bd0b51e..d3b6a6b2 100644 --- a/documentation/sql/keywords.md +++ b/documentation/sql/keywords.md @@ -21,6 +21,8 @@ WHERE GROUP BY HAVING ORDER BY +NULLS FIRST +NULLS LAST OFFSET LIMIT ON @@ -54,7 +56,19 @@ OVER PARTITION BY FIRST_VALUE LAST_VALUE -ARRAY_AGG +ARRAY_AGG +STDDEV +STDDEV_POP +STDDEV_SAMP +VARIANCE +VAR_POP +VAR_SAMP +PERCENTILE_CONT +PERCENTILE_DISC +WITHIN GROUP +ROW_NUMBER +RANK +DENSE_RANK ## String functions UPPER @@ -105,7 +119,9 @@ END COALESCE ISNULL ISNOTNULL -NULLIF +NULLIF +GREATEST +LEAST ## Date/Time/Datetime/Timestamp functions [//]: # (YEAR ) diff --git a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala index d49e42c1..498e46a2 100644 --- a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala +++ b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/ElasticAggregation.scala @@ -44,6 +44,7 @@ import com.sksamuel.elastic4s.ElasticApi.{ maxAgg, minAgg, nestedAggregation, + percentilesAgg, sumAgg, termsAgg, topHitsAgg, @@ -224,6 +225,17 @@ object ElasticAggregation { extendedStatsAgg, (name, s) => extendedStatsAgg(name, sourceField).script(s) ) + case PERCENTILE_CONT | PERCENTILE_DISC => + // Both map to ES `percentiles` (TDigest). One call → one percent; + // the requested value column is `sourceField` (PercentileAgg.identifier). 
+ val pct: Seq[Double] = th match { + case p: PercentileAgg => Seq(p.percent) + case _ => Seq.empty + } + aggWithFieldOrScript( + (name, field) => percentilesAgg(name, field).percents(pct), + (name, s) => percentilesAgg(name, sourceField).percents(pct).script(s) + ) case _ => val isRanking = th.isInstanceOf[RankingWindow] val limit = { diff --git a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala index 6f3672e9..38e3e4a9 100644 --- a/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala +++ b/es6/bridge/src/main/scala/app/softnetwork/elastic/sql/bridge/package.scala @@ -24,7 +24,7 @@ import app.softnetwork.elastic.sql.`type`.{ SQLVarchar } import app.softnetwork.elastic.sql.config.ElasticSqlConfig -import app.softnetwork.elastic.sql.function.aggregate.COUNT +import app.softnetwork.elastic.sql.function.aggregate.{COUNT, PercentileAgg} import app.softnetwork.elastic.sql.function.geo.{Distance, Meters} import app.softnetwork.elastic.sql.operator._ import app.softnetwork.elastic.sql.query._ @@ -41,6 +41,7 @@ import com.sksamuel.elastic4s.searches.aggs.{ AbstractAggregation, FilterAggregation, NestedAggregation, + PercentilesAggregation, TermsAggregation } import com.sksamuel.elastic4s.searches.queries.{BoolQuery, InnerHit, Query} @@ -451,6 +452,34 @@ package object bridge { request.orderBy.map(_.sorts).getOrElse(Seq.empty) ).minScore(request.score) + /** Merge percentile ElasticAggregations that share a value column / `cont` flag / partition into + * the FIRST of them (the owner): set the owner's ES `percentiles` `percents` to the group's + * sorted-distinct union and drop the delegates. Mirrors `SearchApi.toClientAggregations` via the + * shared [[PercentileAgg.coalescePlan]] so query and response agree on the owner node. 
+ */ + private def coalescePercentileAggs( + aggs: Seq[ElasticAggregation] + ): Seq[ElasticAggregation] = { + val items = aggs.collect { + case ea if ea.aggType.isInstanceOf[PercentileAgg] => + ea.aggName -> ea.aggType.asInstanceOf[PercentileAgg] + } + if (items.size < 2) aggs + else { + val plan = PercentileAgg.coalescePlan(items) + aggs.flatMap { ea => + if (plan.isDelegate(ea.aggName)) None + else if (plan.isOwner(ea.aggName)) + Some( + ea.copy(agg = + ea.agg.asInstanceOf[PercentilesAggregation].percents(plan.mergedPercents(ea.aggName)) + ) + ) + else Some(ea) + } + } + } + implicit def requestToSearchRequest( request: SingleSearch )(implicit @@ -459,12 +488,14 @@ package object bridge { ): SearchRequest = { import request._ - val aggregations = request.aggregates.map( - ElasticAggregation( - _, - request.having.flatMap(_.criteria), - request.sorts, - request.sqlAggregations + val aggregations = coalescePercentileAggs( + request.aggregates.map( + ElasticAggregation( + _, + request.having.flatMap(_.criteria), + request.sorts, + request.sqlAggregations + ) ) ) diff --git a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala index c18fcc41..b944a3b7 100644 --- a/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala +++ b/es6/bridge/src/test/scala/app/softnetwork/elastic/sql/SQLQuerySpec.scala @@ -4487,4 +4487,109 @@ class SQLQuerySpec extends AnyFlatSpec with Matchers { query should include("\"extended_stats\":{\"field\":\"salary\"}") } + // === Story 14.5: PERCENTILE_CONT / PERCENTILE_DISC — percentiles translation === + + it should "translate PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) GROUP BY department" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99 + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query 
shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "p99": { "percentiles": { "field": "salary", "percents": [99.0] } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate the PERCENTILE_CONT(salary, 0.95) shorthand to percentiles" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, PERCENTILE_CONT(salary, 0.95) AS p95 + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "p95": { "percentiles": { "field": "salary", "percents": [95.0] } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) to percentiles" in { + val select: ElasticSearchRequest = + SelectStatement( + "SELECT PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) AS median FROM emp" + ) + select.query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "median": { "percentiles": { "field": "salary", "percents": [50.0] } } + | } + |}""".stripMargin.replaceAll("\\s+", "") + } + + it should "translate PERCENTILE_CONT(0.9) OVER (PARTITION BY department ORDER BY salary)" in { + val select: ElasticSearchRequest = + SelectStatement( + """SELECT name, PERCENTILE_CONT(0.9) OVER (PARTITION BY department ORDER BY salary) AS p90 + |FROM emp""".stripMargin + ) + val query = select.query + query should include("\"terms\":{\"field\":\"department\"") + query should include("\"percentiles\":{\"field\":\"salary\",\"percents\":[90.0]}") + } + + it should "coalesce multiple PERCENTILE_CONT on the same column into one percentiles agg" in { 
+ val select: ElasticSearchRequest = + SelectStatement( + """SELECT department, + | PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS p50, + | PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary) AS p95, + | PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99 + |FROM emp + |GROUP BY department""".stripMargin + ) + val query = select.query + query shouldBe + """{ + | "query": { "match_all": {} }, + | "size": 0, + | "_source": false, + | "aggs": { + | "department": { + | "terms": { "field": "department", "min_doc_count": 1 }, + | "aggs": { + | "p50": { "percentiles": { "field": "salary", "percents": [50.0, 95.0, 99.0] } } + | } + | } + | } + |}""".stripMargin.replaceAll("\\s+", "") + "\"percentiles\"".r.findAllIn(query).length shouldBe 1 + } + } diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala index ec199efc..51608f5d 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/function/aggregate/package.scala @@ -59,6 +59,11 @@ package object aggregate { case object VAR_POP extends Expr("VAR_POP") with AggregateFunction with Window case object VAR_SAMP extends Expr("VAR_SAMP") with AggregateFunction with Window + // PERCENTILE_CONT / PERCENTILE_DISC — both translate to the ES `percentiles` + // aggregation (TDigest). See `PercentileAgg`. + case object PERCENTILE_CONT extends Expr("PERCENTILE_CONT") with AggregateFunction with Window + case object PERCENTILE_DISC extends Expr("PERCENTILE_DISC") with AggregateFunction with Window + sealed trait Window extends TokenRegex case object FIRST_VALUE extends Expr("FIRST_VALUE") with Window { @@ -449,6 +454,104 @@ package object aggregate { ) } + /** PERCENTILE_CONT / PERCENTILE_DISC. Both translate to a single ES `percentiles` aggregation + * (TDigest, continuous/interpolated). 
`identifier` is the VALUE COLUMN (sourced by the parser + * from the WITHIN GROUP / OVER `ORDER BY` clause, or the `(col, p)` shorthand's first argument). + * `cont` round-trips the SQL token. + * + * `resultField` is the response `values` key projected during extraction: ES renders the + * percentile as a Double, so 0.99 → percent 99.0 → key "99.0", 0.5 → "50.0". `orderBy` is None + * on purpose — the ORDER BY designates the value column and is consumed by the parser, not + * retained as a window-frame order. + */ + case class PercentileAgg( + identifier: Identifier, + cont: Boolean, + p: Double, + partitionBy: Seq[Identifier] = Seq.empty, + fields: Seq[Field] = Seq.empty + ) extends WindowFunction { + override def baseType: SQLType = SQLTypes.Double + + override def limit: Option[Limit] = None + + override def orderBy: Option[OrderBy] = None + + override def window: Window = if (cont) PERCENTILE_CONT else PERCENTILE_DISC + + /** ES `values` key for this percentile, e.g. 0.99 → "99.0". */ + def resultField: String = PercentileAgg.percentLabel(p) + + /** The percent passed to ES `percents`, e.g. 0.99 → 99.0. */ + def percent: Double = p * 100.0 + + override def withPartitionBy(partitionBy: Seq[Identifier]): WindowFunction = + this.copy(partitionBy = partitionBy) + + override def withFields(fields: Seq[Field]): WindowFunction = this.copy(fields = fields) + + override def update(request: SingleSearch): WindowFunction = super + .update(request) + .asInstanceOf[PercentileAgg] + .copy( + identifier = identifier.update(request) + ) + + // All five input forms normalize to a single canonical round-trip form. 
+ override def sql: String = { + val fn = if (cont) PERCENTILE_CONT else PERCENTILE_DISC + val base = s"$fn($p) WITHIN GROUP (ORDER BY ${identifier.identifierName})" + if (partitionBy.nonEmpty) + s"$base $OVER ($PARTITION_BY ${partitionBy.map(_.identifierName).mkString(", ")})" + else base + } + } + + object PercentileAgg { + + /** Format the percentile the way ES labels it in the response `values` object. percent = p * + * 100, rendered via `Double.toString` — locale-independent (always a `.` decimal separator, + * never `,`) and identical to the value serialized into the ES request, so whole values become + * "50.0"/"99.0" and fractional ones keep their digits ("99.9"). Do NOT use `String.format`/the + * `f` interpolator here: they honour the default locale and would emit "50,0" under e.g. a + * French locale, breaking the response lookup. + */ + def percentLabel(p: Double): String = (p * 100.0).toString + + /** Coalescing plan: percentile aggregations that share the same value column, `cont` flag and + * partition are merged into ONE ES `percentiles` aggregation owned by the FIRST of them in + * SELECT order. `ownerOf` maps every percentile aggName to its owning aggName (an owner maps + * to itself); `mergedPercents` maps each owner aggName to the sorted distinct percents of its + * group. Built identically on the query side (bridge) and the response side (SearchApi) so + * both agree on which node is shared. + * + * @param items + * percentile `(aggName, PercentileAgg)` pairs in SELECT order. 
+ */ + def coalescePlan(items: Seq[(String, PercentileAgg)]): PercentileCoalescePlan = { + val groups = items.groupBy { case (_, pa) => + (pa.identifier.identifierName, pa.cont, pa.partitionBy.map(_.identifierName)) + } + val ownerOf = groups.values.flatMap { g => + val owner = g.head._1 + g.map { case (name, _) => name -> owner } + }.toMap + val mergedPercents = groups.values.map { g => + g.head._1 -> g.map(_._2.percent).distinct.sorted + }.toMap + PercentileCoalescePlan(ownerOf, mergedPercents) + } + } + + /** Result of [[PercentileAgg.coalescePlan]] — see there. */ + final case class PercentileCoalescePlan( + ownerOf: Map[String, String], + mergedPercents: Map[String, Seq[Double]] + ) { + def isOwner(aggName: String): Boolean = ownerOf.get(aggName).contains(aggName) + def isDelegate(aggName: String): Boolean = ownerOf.get(aggName).exists(_ != aggName) + } + /** ROW_NUMBER / RANK / DENSE_RANK — ranking-style windows. * * ANSI requires `ORDER BY` inside the `OVER (...)` clause for ranking functions; the parser diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala index fd0b00cf..fc281e01 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/Parser.scala @@ -1295,6 +1295,8 @@ trait Parser "variance", "var_pop", "var_samp", + "percentile_cont", + "percentile_disc", "case", "when", "then", diff --git a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala index 75cd8b28..f114581c 100644 --- a/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala +++ b/sql/src/main/scala/app/softnetwork/elastic/sql/parser/function/aggregate/package.scala @@ -157,6 +157,68 @@ package object aggregate { ExtendedStatsAgg(top._1, kind, top._2) } + def 
percentile_cont: PackratParser[AggregateFunction] = + PERCENTILE_CONT.regex ^^ (_ => PERCENTILE_CONT) + + def percentile_disc: PackratParser[AggregateFunction] = + PERCENTILE_DISC.regex ^^ (_ => PERCENTILE_DISC) + + // Numeric percentile literal in [0,1] — accepts decimals (0.99) and whole 0/1. + private[this] def percentile_literal: PackratParser[Double] = + (double ^^ (_.value)) | (long ^^ (_.value.toDouble)) + + // (col, p) shorthand OR (p) + private[this] def percentile_args: PackratParser[(Option[Identifier], Double)] = + (start ~> aggWithFunction ~ (separator ~> percentile_literal) <~ end ^^ { case id ~ p => + (Some(id), p) + }) | + (start ~> percentile_literal <~ end ^^ (p => (None, p))) + + // WITHIN GROUP ( ORDER BY ) -> value column(s). A percentile takes a + // SINGLE value column; a multi-column ORDER BY is rejected in `percentile_agg` + // (the full sort list is surfaced here so the guard can count columns). + private[this] def percentile_within_group: PackratParser[Seq[Identifier]] = + """(?i)\bwithin\b""".r ~> """(?i)\bgroup\b""".r ~> start ~> orderBy <~ end ^^ (_.sorts.map( + _.field + )) + + // value column(s) from OVER's ORDER BY, if present + private[this] def percentileOverValueCols( + ov: Option[(Seq[Identifier], Option[OrderBy], Option[Limit])] + ): Seq[Identifier] = + ov.flatMap(_._2).map(_.sorts.map(_.field)).getOrElse(Seq.empty) + + /** PERCENTILE_CONT / PERCENTILE_DISC — five forms, all normalizing to + * `PercentileAgg(valueColumn, cont, p, partitionBy)`. The value column comes from exactly one + * of: the `(col, p)` shorthand, `WITHIN GROUP (ORDER BY col)`, or `OVER (... ORDER BY col)`. + * The `^?` guard rejects (parse failure) when there is no value column, more than one source, + * or `p` outside `[0,1]`. + */ + def percentile_agg: PackratParser[WindowFunction] = + ((percentile_cont | percentile_disc) ~ percentile_args ~ + percentile_within_group.? ~ over.?) ^? 
({ + case fn ~ ((shorthandCol, p)) ~ wg ~ ov if { + // exactly one value-column SOURCE, and that source names exactly ONE + // column (rejects no source, conflicting sources, and a multi-column + // WITHIN GROUP / OVER ORDER BY value list). + val sources = + Seq(shorthandCol.toSeq, wg.getOrElse(Seq.empty), percentileOverValueCols(ov)) + .filter(_.nonEmpty) + sources.size == 1 && sources.head.size == 1 && p >= 0.0 && p <= 1.0 + } => + val valueCol = + Seq(shorthandCol.toSeq, wg.getOrElse(Seq.empty), percentileOverValueCols(ov)) + .filter(_.nonEmpty) + .head + .head + val partitionBy = ov.map(_._1).getOrElse(Seq.empty) + PercentileAgg(valueCol, cont = fn == PERCENTILE_CONT, p, partitionBy) + }, { _ => + "PERCENTILE_CONT/DISC requires a literal percentile in [0,1] and exactly one value " + + "column (a single column via (column, p), WITHIN GROUP (ORDER BY col), or " + + "OVER (... ORDER BY col))" + }) + /** OVER clause variant used by ranking windows: ORDER BY is REQUIRED (ANSI). Falling through to * the optional-orderBy parser would let `ROW_NUMBER() OVER (PARTITION BY d)` parse and then * break at execution; rejecting at parse time is preferable. 
@@ -183,7 +245,7 @@ package object aggregate { def identifierWithWindowFunction: PackratParser[Identifier] = (first_value | last_value | array_agg | count_agg | min_agg | max_agg | avg_agg | sum_agg | - stddev_agg | variance_agg | + stddev_agg | variance_agg | percentile_agg | row_number | rank | dense_rank) ^^ { th => th.identifier.withFunctions(th +: th.identifier.functions) } diff --git a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala index 308582e5..3e909649 100644 --- a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala +++ b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala @@ -3421,4 +3421,54 @@ class ParserSpec extends AnyFlatSpec with Matchers { result.toOption.get shouldBe RefreshLicense } + // === Story 14.5: PERCENTILE_CONT / PERCENTILE_DISC === + + private val percentileForms = Seq( + "SELECT department, PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99 FROM emp GROUP BY department", + "SELECT name, PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department) AS p99 FROM emp", + "SELECT name, PERCENTILE_CONT(0.99) OVER (PARTITION BY department ORDER BY salary) AS p99 FROM emp", + "SELECT department, PERCENTILE_CONT(salary, 0.99) AS p99 FROM emp GROUP BY department", + "SELECT PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) AS median FROM emp" + ) + + percentileForms.zipWithIndex.foreach { case (sql, i) => + it should s"parse percentile form #${i + 1}" in { + Parser(sql).isRight shouldBe true + } + } + + it should "parse percentiles case-insensitively" in { + Parser( + "select department, percentile_cont(0.99) within group (order by salary) as p99 from emp group by department" + ).isRight shouldBe true + } + + it should "canonicalize a percentile to a stable re-parseable form" in { + val canon = Parser(percentileForms.head).toOption.map(_.sql).getOrElse("") + canon should not be 
empty + Parser(canon).toOption.map(_.sql).getOrElse("") shouldBe canon + } + + it should "reject a percentile literal outside [0,1]" in { + Parser( + "SELECT PERCENTILE_CONT(1.5) WITHIN GROUP (ORDER BY salary) AS x FROM emp" + ).isLeft shouldBe true + } + + it should "reject a percentile with no value column" in { + Parser("SELECT PERCENTILE_CONT(0.5) AS x FROM emp").isLeft shouldBe true + } + + it should "reject a multi-column WITHIN GROUP value source" in { + Parser( + "SELECT PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary, bonus) AS x FROM emp" + ).isLeft shouldBe true + } + + it should "reject a multi-column OVER ORDER BY value source" in { + Parser( + "SELECT name, PERCENTILE_CONT(0.5) OVER (PARTITION BY department ORDER BY salary, bonus) AS x FROM emp" + ).isLeft shouldBe true + } + } diff --git a/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala b/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala index 51d97e51..eadd93e1 100644 --- a/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala +++ b/testkit/src/main/scala/app/softnetwork/elastic/client/WindowFunctionSpec.scala @@ -2033,4 +2033,83 @@ trait WindowFunctionSpec } } + "PERCENTILE family" should "compute p50/p95/p99 of salary per department" in { + val results = client.searchAs[DepartmentPercentiles]( + """ + SELECT + department, + PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS p50_salary, + PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary) AS p95_salary, + PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99_salary + FROM emp + GROUP BY department + """ + ) + + results match { + case ElasticSuccess(rows) => + rows should not be empty + + val eng = rows.find(_.department == "Engineering").getOrElse(fail("no Engineering row")) + eng.p50_salary should not be empty + eng.p95_salary should not be empty + eng.p99_salary should not be empty + + // TDigest percentiles are monotonic non-decreasing: p50 ≤ p95 
≤ p99. + rows.foreach { r => + (r.p50_salary, r.p95_salary, r.p99_salary) match { + case (Some(p50), Some(p95), Some(p99)) => + p50 should be <= p95 + p95 should be <= p99 + log.info(f"${r.department}%-12s p50=$p50%10.2f p95=$p95%10.2f p99=$p99%10.2f") + case _ => + log.info(s"${r.department}: missing percentile values") + } + } + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + + "PERCENTILE coalescing" should "return correct per-alias values when several percentiles share a column" in { + // q1/q2/q3/p99 on `salary` all coalesce into ONE ES `percentiles` aggregation + // (verified at the query level in SQLQuerySpec). The deliberately + // non-sequential aliases prove each percentile is split back out of the + // shared node under its OWN alias — not positionally. + val results = client.searchAs[DepartmentQuartiles]( + """ + SELECT department, + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) AS q1, + PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS q2, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) AS q3, + PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99 + FROM emp + GROUP BY department + """ + ) + + results match { + case ElasticSuccess(rows) => + rows should not be empty + rows.foreach { r => + (r.q1, r.q2, r.q3, r.p99) match { + case (Some(q1), Some(q2), Some(q3), Some(p99)) => + // Per-alias values, monotonic non-decreasing across the quartiles. 
+ q1 should be <= q2 + q2 should be <= q3 + q3 should be <= p99 + log.info( + f"${r.department}%-12s q1=$q1%10.2f q2=$q2%10.2f q3=$q3%10.2f p99=$p99%10.2f" + ) + case _ => + fail(s"${r.department}: a coalesced percentile column came back null") + } + } + + case ElasticFailure(error) => + fail(s"Query failed: ${error.message}") + } + } + } diff --git a/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala b/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala index 5110caca..a9b83085 100644 --- a/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala +++ b/testkit/src/main/scala/app/softnetwork/elastic/model/window/package.scala @@ -72,6 +72,29 @@ package object window { vs_salary: Option[Double] = None ) + /** Story 14.5 — PERCENTILE_CONT integration shape. Each field is one percentile projected from a + * single ES `percentiles` aggregation per call. + */ + case class DepartmentPercentiles( + department: String, + p50_salary: Option[Double] = None, + p95_salary: Option[Double] = None, + p99_salary: Option[Double] = None + ) + + /** Story 14.5 — percentile COALESCING shape. Several percentiles on the same column merge into + * one ES `percentiles` aggregation; the non-sequential aliases (q1/q2/q3/p99) prove each column + * is returned under its own alias (not positionally) when the value is split back out of the + * shared node. 
+ */ + case class DepartmentQuartiles( + department: String, + q1: Option[Double] = None, + q2: Option[Double] = None, + q3: Option[Double] = None, + p99: Option[Double] = None + ) + case class DepartmentWithWindow( department: String, location: Option[String] = None, From d38ad912ea6f8c51aa47b45350fc67ae1851ec5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Manciot?= Date: Mon, 8 Jun 2026 07:19:18 +0200 Subject: [PATCH 6/6] =?UTF-8?q?feat(sql):=20close=20out=20Epic=2014=20?= =?UTF-8?q?=E2=80=94=20analytical-SQL=20parser=20regression=20audit=20(Sto?= =?UTF-8?q?ry=2014.6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gating verification story for Epic 14 (Analytical SQL, R1). Stories 14.1–14.5 added 18 new SQL keywords to the parser; this story verifies they introduce no regression and reconciles the docs/help indexes. - Cross-compile green on Scala 2.12.20 + 2.13.16 - ParserSpec, ElasticConversionSpec, es8 + es6 bridge SQLQuerySpec all green - Keyword-collision audit clean: no fixture rename required (rank/row_number bare-identifier uses exist only in dead block-comment code; live fixtures alias to rnum/rk/drk) - Case-insensitivity confirmed via (?i) TokenRegex + new lowercased-keyword ParserSpec test covering all 18 Epic-14 keywords - WindowFunctionSpec live-green on all 4 ES majors (es6/es7/es8 default JDK, es9 under JDK 17) - Docs reconciled in both repos: backfilled functions_conditional.md (GREATEST/LEAST) and functions_aggregate.md (ROW_NUMBER/RANK/DENSE_RANK) - Help index reconciled: indexed real SAFE_CAST help file + added the REGEXP_LIKE help file that resolves a dangling string/_index.json ref - scalafmtAll clean Epic 14 stories: - #99 Story 14.1 — ORDER BY ... 
NULLS FIRST / NULLS LAST - #100 Story 14.2 — GREATEST / LEAST conditional functions - #101 Story 14.3 — ROW_NUMBER / RANK / DENSE_RANK ranking windows - #102 Story 14.4 — STDDEV / VARIANCE statistical aggregates - #103 Story 14.5 — PERCENTILE_CONT / PERCENTILE_DISC percentiles - #104 Story 14.6 — parser regression audit (this commit) Closed Issue #104 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../help/functions/conversion/_index.json | 1 + .../help/functions/conversion/safe_cast.json | 39 ++++++++++ .../help/functions/string/regexp_like.json | 53 +++++++++++++ documentation/sql/functions_aggregate.md | 39 ++++++++++ documentation/sql/functions_conditional.md | 74 +++++++++++++++++++ .../elastic/sql/parser/ParserSpec.scala | 36 +++++++++ 6 files changed, 242 insertions(+) create mode 100644 core/src/main/resources/help/functions/conversion/safe_cast.json create mode 100644 core/src/main/resources/help/functions/string/regexp_like.json diff --git a/core/src/main/resources/help/functions/conversion/_index.json b/core/src/main/resources/help/functions/conversion/_index.json index b87da57d..df0c1d11 100644 --- a/core/src/main/resources/help/functions/conversion/_index.json +++ b/core/src/main/resources/help/functions/conversion/_index.json @@ -1,5 +1,6 @@ [ "cast.json", "try_cast.json", + "safe_cast.json", "convert.json" ] diff --git a/core/src/main/resources/help/functions/conversion/safe_cast.json b/core/src/main/resources/help/functions/conversion/safe_cast.json new file mode 100644 index 00000000..d6eff250 --- /dev/null +++ b/core/src/main/resources/help/functions/conversion/safe_cast.json @@ -0,0 +1,39 @@ +{ + "name": "SAFE_CAST", + "category": "Conversion", + "shortDescription": "Convert value to specified type, NULL on failure (BigQuery syntax)", + "syntax": [ + "SAFE_CAST(expression AS data_type)" + ], + "description": "Attempts to convert an expression to the specified data type. Returns NULL if conversion fails. 
BigQuery-compatible alias for TRY_CAST.", + "parameters": [ + { + "name": "expression", + "type": "ANY", + "description": "Value to convert", + "optional": false, + "defaultValue": null + }, + { + "name": "data_type", + "type": "KEYWORD", + "description": "Target data type", + "optional": false, + "defaultValue": null + } + ], + "returnType": "Specified data_type or NULL", + "examples": [ + { + "title": "Safe conversion", + "description": "Convert with NULL on failure", + "sql": "SELECT SAFE_CAST('abc' AS INT) as result" + } + ], + "notes": [ + "Equivalent to TRY_CAST", + "Provided for BigQuery compatibility" + ], + "seeAlso": ["TRY_CAST", "CAST"], + "aliases": [] +} diff --git a/core/src/main/resources/help/functions/string/regexp_like.json b/core/src/main/resources/help/functions/string/regexp_like.json new file mode 100644 index 00000000..5ffae938 --- /dev/null +++ b/core/src/main/resources/help/functions/string/regexp_like.json @@ -0,0 +1,53 @@ +{ + "name": "REGEXP_LIKE", + "category": "String", + "shortDescription": "Test whether a string matches a regular expression", + "syntax": [ + "REGEXP_LIKE(string, pattern)", + "REGEXP_LIKE(string, pattern, flags)" + ], + "description": "Returns TRUE when the source string matches the given regular expression pattern. An optional flags string controls matching behavior.", + "parameters": [ + { + "name": "string", + "type": "VARCHAR", + "description": "Source string", + "optional": false, + "defaultValue": null + }, + { + "name": "pattern", + "type": "VARCHAR", + "description": "Regular expression pattern", + "optional": false, + "defaultValue": null + }, + { + "name": "flags", + "type": "VARCHAR", + "description": "Optional match flags: 'i' (case-insensitive), 'n' (dotall, '.' 
matches newlines), 'c' (case-sensitive, the default)", + "optional": true, + "defaultValue": null + } + ], + "returnType": "BOOLEAN", + "examples": [ + { + "title": "Match a pattern", + "description": "Rows whose code starts with two letters followed by digits", + "sql": "SELECT code FROM products WHERE REGEXP_LIKE(code, '^[A-Z]{2}[0-9]+')" + }, + { + "title": "Case-insensitive match", + "description": "Match regardless of letter case using the 'i' flag", + "sql": "SELECT name FROM contacts WHERE REGEXP_LIKE(name, 'smith', 'i')" + } + ], + "notes": [ + "Uses Java regular expression syntax", + "Flags: 'i' = case-insensitive, 'n' = dotall, 'c' = case-sensitive (default)", + "REGEXP is an accepted alias for REGEXP_LIKE" + ], + "seeAlso": ["REPLACE", "POSITION"], + "aliases": ["REGEXP"] +} diff --git a/documentation/sql/functions_aggregate.md b/documentation/sql/functions_aggregate.md index 414940be..8a74355a 100644 --- a/documentation/sql/functions_aggregate.md +++ b/documentation/sql/functions_aggregate.md @@ -20,6 +20,7 @@ This page documents aggregate functions for summarizing and analyzing data. 8. [ARRAY_AGG](#function-array_agg) 9. [STDDEV / VARIANCE family](#function-stddev--variance-family) 10. [PERCENTILE_CONT / PERCENTILE_DISC](#function-percentile_cont--percentile_disc) +11. [ROW_NUMBER / RANK / DENSE_RANK (ranking windows)](#function-row_number--rank--dense_rank-ranking-windows) --- @@ -1309,6 +1310,41 @@ GROUP BY department; --- +## Function: ROW_NUMBER / RANK / DENSE_RANK (ranking windows) + +**Description:** +The three ANSI ranking window functions assign an ordinal to each row within a +partition, ordered by the `OVER (... ORDER BY ...)` clause: + +- `ROW_NUMBER()` — sequential 1-based ordinals (1, 2, 3, ...); no ties recognized. +- `RANK()` — ties share a rank and the next rank **skips** (1, 2, 2, 4, ...). +- `DENSE_RANK()` — ties share a rank and the next rank does **not** skip (1, 2, 2, 3, ...). 
+ +`ORDER BY` is **required** inside `OVER`; `PARTITION BY` is optional (absent → the +whole result set is one partition). A `LIMIT N` inside `OVER` is pushed into the +Elasticsearch `top_hits` sub-aggregation, returning only the top-N rows per partition. + +**Syntax:** +```sql +ROW_NUMBER() OVER ([PARTITION BY ...] ORDER BY ... [LIMIT N]) +RANK() OVER ([PARTITION BY ...] ORDER BY ... [LIMIT N]) +DENSE_RANK() OVER ([PARTITION BY ...] ORDER BY ... [LIMIT N]) +``` + +**Example:** +```sql +SELECT name, salary, + ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn, + RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rk, + DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dr +FROM emp; +``` + +> Full ranking-window details and top-N push-down examples are in +> [DQL statements — ranking windows](dql_statements.md). + +--- + ## Aggregate Functions Summary | Function | Purpose | Input | Output | NULL Handling | @@ -1331,5 +1367,8 @@ GROUP BY department; | `VAR_POP(expr)` | Population variance | Numeric | `DOUBLE` | Ignores NULLs | | `PERCENTILE_CONT(p)` | Continuous percentile | Numeric | `DOUBLE` | Ignores NULLs | | `PERCENTILE_DISC(p)` | Discrete percentile | Numeric | `DOUBLE` | Ignores NULLs | +| `ROW_NUMBER()` | Sequential ordinal | — | `BIGINT` | n/a (window) | +| `RANK()` | Rank, ties skip | — | `BIGINT` | n/a (window) | +| `DENSE_RANK()` | Rank, ties dense | — | `BIGINT` | n/a (window) | [Back to index](README.md) diff --git a/documentation/sql/functions_conditional.md b/documentation/sql/functions_conditional.md index c2fb2ca3..d601adba 100644 --- a/documentation/sql/functions_conditional.md +++ b/documentation/sql/functions_conditional.md @@ -394,4 +394,78 @@ SELECT FROM products ``` +--- + +### GREATEST + +Returns the largest value among the given numeric expressions. NULL arguments are ignored (PostgreSQL-style NULL handling); the result is NULL only if every argument is NULL. 
+ +**Syntax:** +```sql +GREATEST(expression1, expression2, ...) +``` + +**Inputs:** +- One or more numeric expressions to compare + +**Output:** +- Numeric (widest input type), or NULL if every argument is NULL + +**Examples:** + +**1. Highest of multiple prices:** +```sql +SELECT GREATEST(price_us, price_eu, price_uk) AS max_price +FROM products +``` + +**2. Floor on a computed value:** +```sql +SELECT GREATEST(0, base_price - rebate) AS net +FROM orders +-- Never lets the net go below zero +``` + +**Notes:** +- Ignores NULL arguments (PostgreSQL-style NULL handling); returns NULL only if every argument is NULL. +- All arguments should be numeric and have comparable types. +- See also: `LEAST`, `COALESCE`. + +--- + +### LEAST + +Returns the smallest value among the given numeric expressions. NULL arguments are ignored (PostgreSQL-style NULL handling); the result is NULL only if every argument is NULL. + +**Syntax:** +```sql +LEAST(expression1, expression2, ...) +``` + +**Inputs:** +- One or more numeric expressions to compare + +**Output:** +- Numeric (widest input type), or NULL if every argument is NULL + +**Examples:** + +**1. Lowest of multiple prices:** +```sql +SELECT LEAST(price_us, price_eu, price_uk) AS min_price +FROM products +``` + +**2. Cap on a computed value:** +```sql +SELECT LEAST(rebate, base_price) AS applied_rebate +FROM orders +-- Never lets the rebate exceed the base price +``` + +**Notes:** +- Ignores NULL arguments (PostgreSQL-style NULL handling); returns NULL only if every argument is NULL. +- All arguments should be numeric and have comparable types. +- See also: `GREATEST`, `COALESCE`. 
+ [Back to index](README.md) diff --git a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala index 3e909649..3412872f 100644 --- a/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala +++ b/sql/src/test/scala/app/softnetwork/elastic/sql/parser/ParserSpec.scala @@ -1061,6 +1061,42 @@ class ParserSpec extends AnyFlatSpec with Matchers { result.isLeft shouldBe true } + // Story 14.6 — case-insensitivity of the Epic-14 keywords (AC 4): every new + // reserved word resolves through the (?i) TokenRegex (and NULLS FIRST/LAST + // through its own (?i) production), so a fully-lowercased query must parse. + // percentile_cont is also covered in the 14.5 block; this test exercises all + // 18 Epic-14 keywords lowercased, including the stddev_pop/stddev_samp/ + // percentile_disc and nulls-last forms the original 14.6 test omitted. + it should "parse the Epic-14 window/stats/conditional keywords case-insensitively" in { + Parser( + "select name, rank() over (partition by department order by salary desc) as r from emp" + ).isRight shouldBe true + Parser( + "select name, row_number() over (order by salary desc) as rn from emp" + ).isRight shouldBe true + Parser( + "select name, dense_rank() over (partition by department order by salary desc) as dr from emp" + ).isRight shouldBe true + Parser( + "select department, stddev(salary) as sd, variance(salary) as v from emp group by department" + ).isRight shouldBe true + Parser( + "select department, var_pop(salary) as vp, var_samp(salary) as vs from emp group by department" + ).isRight shouldBe true + Parser( + "select department, stddev_pop(salary) as sdp, stddev_samp(salary) as sds from emp group by department" + ).isRight shouldBe true + Parser( + "select percentile_disc(0.5) within group (order by salary) as median from emp" + ).isRight shouldBe true + Parser( + "select greatest(price_us, price_eu) as hi, least(price_us, 
price_eu) as lo from products" + ).isRight shouldBe true + Parser( + "select * from emp order by salary asc nulls last" + ).isRight shouldBe true + } + it should "parse conversion function" in { val result = Parser(conversion) result.toOption