Skip to content

Commit a5a72cd

Browse files
authored
Merge branch 'main' into sycai_type_usage
2 parents 2e203ea + cfa1c8e commit a5a72cd

File tree

38 files changed

+940
-213
lines changed

38 files changed

+940
-213
lines changed

bigframes/bigquery/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
from bigframes.bigquery._operations.search import create_vector_index, vector_search
6161
from bigframes.bigquery._operations.sql import sql_scalar
6262
from bigframes.bigquery._operations.struct import struct
63+
from bigframes.bigquery._operations.table import create_external_table
6364
from bigframes.core.logging import log_adapter
6465

6566
_functions = [
@@ -104,6 +105,8 @@
104105
sql_scalar,
105106
# struct ops
106107
struct,
108+
# table ops
109+
create_external_table,
107110
]
108111

109112
_module = sys.modules[__name__]
@@ -155,6 +158,8 @@
155158
"sql_scalar",
156159
# struct ops
157160
"struct",
161+
# table ops
162+
"create_external_table",
158163
# Modules / SQL namespaces
159164
"ai",
160165
"ml",

bigframes/bigquery/_operations/ml.py

Lines changed: 150 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from __future__ import annotations
1616

17-
from typing import cast, Mapping, Optional, Union
17+
from typing import cast, List, Mapping, Optional, Union
1818

1919
import bigframes_vendored.constants
2020
import google.cloud.bigquery
@@ -431,3 +431,152 @@ def transform(
431431
return bpd.read_gbq_query(sql)
432432
else:
433433
return session.read_gbq_query(sql)
434+
435+
436+
@log_adapter.method_logger(custom_base_name="bigquery_ml")
437+
def generate_text(
438+
model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
439+
input_: Union[pd.DataFrame, dataframe.DataFrame, str],
440+
*,
441+
temperature: Optional[float] = None,
442+
max_output_tokens: Optional[int] = None,
443+
top_k: Optional[int] = None,
444+
top_p: Optional[float] = None,
445+
flatten_json_output: Optional[bool] = None,
446+
stop_sequences: Optional[List[str]] = None,
447+
ground_with_google_search: Optional[bool] = None,
448+
request_type: Optional[str] = None,
449+
) -> dataframe.DataFrame:
450+
"""
451+
Generates text using a BigQuery ML model.
452+
453+
See the `BigQuery ML GENERATE_TEXT function syntax
454+
<https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-text>`_
455+
for additional reference.
456+
457+
Args:
458+
model (bigframes.ml.base.BaseEstimator or str):
459+
The model to use for text generation.
460+
input_ (Union[bigframes.pandas.DataFrame, str]):
461+
The DataFrame or query to use for text generation.
462+
temperature (float, optional):
463+
A FLOAT64 value that is used for sampling promiscuity. The value
464+
must be in the range ``[0.0, 1.0]``. A lower temperature works well
465+
for prompts that expect a more deterministic and less open-ended
466+
or creative response, while a higher temperature can lead to more
467+
diverse or creative results. A temperature of ``0`` is
468+
deterministic, meaning that the highest probability response is
469+
always selected.
470+
max_output_tokens (int, optional):
471+
An INT64 value that sets the maximum number of tokens in the
472+
generated text.
473+
top_k (int, optional):
474+
An INT64 value that changes how the model selects tokens for
475+
output. A ``top_k`` of ``1`` means the next selected token is the
476+
most probable among all tokens in the model's vocabulary. A
477+
``top_k`` of ``3`` means that the next token is selected from
478+
among the three most probable tokens by using temperature. The
479+
default value is ``40``.
480+
top_p (float, optional):
481+
A FLOAT64 value that changes how the model selects tokens for
482+
output. Tokens are selected from most probable to least probable
483+
until the sum of their probabilities equals the ``top_p`` value.
484+
For example, if tokens A, B, and C have a probability of 0.3, 0.2,
485+
and 0.1 and the ``top_p`` value is ``0.5``, then the model will
486+
select either A or B as the next token by using temperature. The
487+
default value is ``0.95``.
488+
flatten_json_output (bool, optional):
489+
A BOOL value that determines the content of the generated JSON column.
490+
stop_sequences (List[str], optional):
491+
An ARRAY<STRING> value that contains the stop sequences for the model.
492+
ground_with_google_search (bool, optional):
493+
A BOOL value that determines whether to ground the model with Google Search.
494+
request_type (str, optional):
495+
A STRING value that contains the request type for the model.
496+
497+
Returns:
498+
bigframes.pandas.DataFrame:
499+
The generated text.
500+
"""
501+
import bigframes.pandas as bpd
502+
503+
model_name, session = _get_model_name_and_session(model, input_)
504+
table_sql = _to_sql(input_)
505+
506+
sql = bigframes.core.sql.ml.generate_text(
507+
model_name=model_name,
508+
table=table_sql,
509+
temperature=temperature,
510+
max_output_tokens=max_output_tokens,
511+
top_k=top_k,
512+
top_p=top_p,
513+
flatten_json_output=flatten_json_output,
514+
stop_sequences=stop_sequences,
515+
ground_with_google_search=ground_with_google_search,
516+
request_type=request_type,
517+
)
518+
519+
if session is None:
520+
return bpd.read_gbq_query(sql)
521+
else:
522+
return session.read_gbq_query(sql)
523+
524+
525+
@log_adapter.method_logger(custom_base_name="bigquery_ml")
526+
def generate_embedding(
527+
model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series],
528+
input_: Union[pd.DataFrame, dataframe.DataFrame, str],
529+
*,
530+
flatten_json_output: Optional[bool] = None,
531+
task_type: Optional[str] = None,
532+
output_dimensionality: Optional[int] = None,
533+
) -> dataframe.DataFrame:
534+
"""
535+
Generates text embedding using a BigQuery ML model.
536+
537+
See the `BigQuery ML GENERATE_EMBEDDING function syntax
538+
<https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding>`_
539+
for additional reference.
540+
541+
Args:
542+
model (bigframes.ml.base.BaseEstimator or str):
543+
The model to use for text embedding.
544+
input_ (Union[bigframes.pandas.DataFrame, str]):
545+
The DataFrame or query to use for text embedding.
546+
flatten_json_output (bool, optional):
547+
A BOOL value that determines the content of the generated JSON column.
548+
task_type (str, optional):
549+
A STRING value that specifies the intended downstream application task.
550+
Supported values are:
551+
- `RETRIEVAL_QUERY`
552+
- `RETRIEVAL_DOCUMENT`
553+
- `SEMANTIC_SIMILARITY`
554+
- `CLASSIFICATION`
555+
- `CLUSTERING`
556+
- `QUESTION_ANSWERING`
557+
- `FACT_VERIFICATION`
558+
- `CODE_RETRIEVAL_QUERY`
559+
output_dimensionality (int, optional):
560+
An INT64 value that specifies the size of the output embedding.
561+
562+
Returns:
563+
bigframes.pandas.DataFrame:
564+
The generated text embedding.
565+
"""
566+
import bigframes.pandas as bpd
567+
568+
model_name, session = _get_model_name_and_session(model, input_)
569+
table_sql = _to_sql(input_)
570+
571+
sql = bigframes.core.sql.ml.generate_embedding(
572+
model_name=model_name,
573+
table=table_sql,
574+
flatten_json_output=flatten_json_output,
575+
task_type=task_type,
576+
output_dimensionality=output_dimensionality,
577+
)
578+
579+
if session is None:
580+
return bpd.read_gbq_query(sql)
581+
else:
582+
return session.read_gbq_query(sql)
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
from typing import Mapping, Optional, Union
18+
19+
import bigframes_vendored.constants
20+
import google.cloud.bigquery
21+
import pandas as pd
22+
23+
import bigframes.core.logging.log_adapter as log_adapter
24+
import bigframes.core.sql.table
25+
import bigframes.session
26+
27+
28+
def _get_table_metadata(
29+
*,
30+
bqclient: google.cloud.bigquery.Client,
31+
table_name: str,
32+
) -> pd.Series:
33+
table_metadata = bqclient.get_table(table_name)
34+
table_dict = table_metadata.to_api_repr()
35+
return pd.Series(table_dict)
36+
37+
38+
@log_adapter.method_logger(custom_base_name="bigquery_table")
39+
def create_external_table(
40+
table_name: str,
41+
*,
42+
replace: bool = False,
43+
if_not_exists: bool = False,
44+
columns: Optional[Mapping[str, str]] = None,
45+
partition_columns: Optional[Mapping[str, str]] = None,
46+
connection_name: Optional[str] = None,
47+
options: Mapping[str, Union[str, int, float, bool, list]],
48+
session: Optional[bigframes.session.Session] = None,
49+
) -> pd.Series:
50+
"""
51+
Creates a BigQuery external table.
52+
53+
See the `BigQuery CREATE EXTERNAL TABLE DDL syntax
54+
<https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#create_external_table_statement>`_
55+
for additional reference.
56+
57+
Args:
58+
table_name (str):
59+
The name of the table in BigQuery.
60+
replace (bool, default False):
61+
Whether to replace the table if it already exists.
62+
if_not_exists (bool, default False):
63+
Whether to ignore the error if the table already exists.
64+
columns (Mapping[str, str], optional):
65+
The table's schema.
66+
partition_columns (Mapping[str, str], optional):
67+
The table's partition columns.
68+
connection_name (str, optional):
69+
The connection to use for the table.
70+
options (Mapping[str, Union[str, int, float, bool, list]]):
71+
The OPTIONS clause, which specifies the table options.
72+
session (bigframes.session.Session, optional):
73+
The session to use. If not provided, the default session is used.
74+
75+
Returns:
76+
pandas.Series:
77+
A Series with object dtype containing the table metadata. Reference
78+
the `BigQuery Table REST API reference
79+
<https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table>`_
80+
for available fields.
81+
"""
82+
import bigframes.pandas as bpd
83+
84+
sql = bigframes.core.sql.table.create_external_table_ddl(
85+
table_name=table_name,
86+
replace=replace,
87+
if_not_exists=if_not_exists,
88+
columns=columns,
89+
partition_columns=partition_columns,
90+
connection_name=connection_name,
91+
options=options,
92+
)
93+
94+
if session is None:
95+
bpd.read_gbq_query(sql)
96+
session = bpd.get_global_session()
97+
assert (
98+
session is not None
99+
), f"Missing connection to BigQuery. Please report how you encountered this error at {bigframes_vendored.constants.FEEDBACK_LINK}."
100+
else:
101+
session.read_gbq_query(sql)
102+
103+
return _get_table_metadata(bqclient=session.bqclient, table_name=table_name)

bigframes/bigquery/ml.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
create_model,
2424
evaluate,
2525
explain_predict,
26+
generate_embedding,
27+
generate_text,
2628
global_explain,
2729
predict,
2830
transform,
@@ -35,4 +37,6 @@
3537
"explain_predict",
3638
"global_explain",
3739
"transform",
40+
"generate_text",
41+
"generate_embedding",
3842
]

bigframes/core/compile/sqlglot/aggregations/unary_compiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ def _(
527527
else:
528528
result = apply_window_if_present(result, window)
529529

530-
if op.should_floor_result:
530+
if op.should_floor_result or column.dtype == dtypes.TIMEDELTA_DTYPE:
531531
result = sge.Cast(this=sge.func("FLOOR", result), to="INT64")
532532
return result
533533

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@
4242
def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
4343
"""Compiles a BigFrameNode according to the request into SQL using SQLGlot."""
4444

45-
# Generator for unique identifiers.
46-
uid_gen = guid.SequentialUIDGenerator()
4745
output_names = tuple((expression.DerefOp(id), id.sql) for id in request.node.ids)
4846
result_node = nodes.ResultNode(
4947
request.node,
@@ -62,12 +60,8 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
6260
)
6361
if request.sort_rows:
6462
result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node))
65-
result_node = _remap_variables(result_node, uid_gen)
66-
result_node = typing.cast(
67-
nodes.ResultNode, rewrite.defer_selection(result_node)
68-
)
6963
encoded_type_refs = data_type_logger.encode_type_refs(result_node)
70-
sql = _compile_result_node(result_node, uid_gen)
64+
sql = _compile_result_node(result_node)
7165
return configs.CompileResult(
7266
sql,
7367
result_node.schema.to_bigquery(),
@@ -78,9 +72,6 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
7872
ordering: typing.Optional[bf_ordering.RowOrdering] = result_node.order_by
7973
result_node = dataclasses.replace(result_node, order_by=None)
8074
result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node))
81-
82-
result_node = _remap_variables(result_node, uid_gen)
83-
result_node = typing.cast(nodes.ResultNode, rewrite.defer_selection(result_node))
8475
encoded_type_refs = data_type_logger.encode_type_refs(result_node)
8576
sql = _compile_result_node(result_node, uid_gen)
8677
# Return the ordering iff no extra columns are needed to define the row order
@@ -105,11 +96,16 @@ def _remap_variables(
10596
return typing.cast(nodes.ResultNode, result_node)
10697

10798

108-
def _compile_result_node(
109-
root: nodes.ResultNode, uid_gen: guid.SequentialUIDGenerator
110-
) -> str:
99+
def _compile_result_node(root: nodes.ResultNode) -> str:
100+
# Create UIDs to standardize variable names and ensure consistent compilation
101+
# of nodes using the same generator.
102+
uid_gen = guid.SequentialUIDGenerator()
103+
root = _remap_variables(root, uid_gen)
104+
root = typing.cast(nodes.ResultNode, rewrite.defer_selection(root))
105+
111106
# Have to bind schema as the final step before compilation.
112107
root = typing.cast(nodes.ResultNode, schema_binding.bind_schema_to_tree(root))
108+
113109
selected_cols: tuple[tuple[str, sge.Expression], ...] = tuple(
114110
(name, scalar_compiler.scalar_op_compiler.compile_expression(ref))
115111
for ref, name in root.output_cols
@@ -135,7 +131,6 @@ def _compile_result_node(
135131
return sqlglot_ir.sql
136132

137133

138-
@functools.lru_cache(maxsize=5000)
139134
def compile_node(
140135
node: nodes.BigFrameNode, uid_gen: guid.SequentialUIDGenerator
141136
) -> ir.SQLGlotIR:
@@ -274,10 +269,16 @@ def compile_concat(node: nodes.ConcatNode, *children: ir.SQLGlotIR) -> ir.SQLGlo
274269
assert len(children) >= 1
275270
uid_gen = children[0].uid_gen
276271

277-
output_ids = [id.sql for id in node.output_ids]
272+
# BigQuery `UNION` query takes the column names from the first `SELECT` clause.
273+
default_output_ids = [field.id.sql for field in node.child_nodes[0].fields]
274+
output_aliases = [
275+
(default_output_id, output_id.sql)
276+
for default_output_id, output_id in zip(default_output_ids, node.output_ids)
277+
]
278+
278279
return ir.SQLGlotIR.from_union(
279280
[child.expr for child in children],
280-
output_ids=output_ids,
281+
output_aliases=output_aliases,
281282
uid_gen=uid_gen,
282283
)
283284

0 commit comments

Comments
 (0)