From 68980bc5c4f5374ce5e9342fdcd5320071eb462f Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 20:55:29 -0700 Subject: [PATCH 1/8] [ai/azure-ai-projects] make traces-for-evaluation sample self-contained Rewrite samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py so it works without prior setup. By default, the sample now creates a temporary Foundry agent, runs three multi-turn Widgets & Gizmos conversations against it with GenAI content tracing enabled (configure_azure_monitor + AIProjectInstrumentor with enable_content_recording=True), waits for App Insights ingestion, then submits the existing data-generation job over a window that exactly brackets the seeded traces. The temporary agent, seeded conversations, generated dataset, and data-generation job are all cleaned up in a best-effort finally block. Users who already have an agent with traces can opt into bring-your-own-agent mode by setting FOUNDRY_AGENT_NAME; in that mode the sample skips agent creation, trace seeding, and ingestion wait and uses the existing FOUNDRY_TRACES_WINDOW_DAYS look-back window (default 7 days). New seeding knobs (TRACE_SEEDING_CONVERSATIONS, TRACE_SEEDING_TURNS, TRACE_INGESTION_WAIT_SECONDS) make timing tunable per environment. Validated end-to-end against the build26-bug-bash project on gpt-5.1: self-contained run produced 15 generated samples and cleaned up all temporary resources successfully. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 497 ++++++++++++++---- 1 file changed, 407 insertions(+), 90 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index 9b2ca86a89bf..e7c60468b4c7 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -7,47 +7,79 @@ """ DESCRIPTION: Generates an evaluation dataset from an agent's recent conversation - traces. The sample: + traces. The sample runs in one of two modes: - 1. Creates a `DataGenerationJob` (scenario=EVALUATION, type=traces) that - reads spans from Application Insights for an existing agent within a + * Self-contained mode (default): Creates a temporary Foundry agent, + runs a few sample conversations against it with GenAI content + tracing enabled so spans flow to Application Insights, waits for + ingestion, then runs the data generation job. The temporary agent + and conversations are deleted at the end. Use this mode to try the + sample without preparing anything in advance. + * Bring-your-own-agent mode (BYO): Set FOUNDRY_AGENT_NAME to point at + an existing agent that already has recent conversation traces. The + sample skips agent creation and trace seeding and uses your agent + as-is. + + In both modes, the sample: + 1. Creates a `DataGenerationJob` (scenario=EVALUATION, type=traces) + that reads spans from Application Insights for the agent within a time window and synthesizes question / answer pairs into a new versioned Dataset. - 2. Polls the job to completion and resolves the resulting `DatasetVersion`. - 3. Cleans up the generated dataset and the data generation job. + 2. 
Polls the job to completion and resolves the resulting + `DatasetVersion`. + 3. Cleans up the generated dataset, the data generation job, and + (in self-contained mode) the temporary agent and conversations. - The Traces source consumes existing telemetry, so no `model_options` are - required — the service derives samples directly from the agent's traces. - The agent must have at least one trace recorded within the configured - look-back window or the job will succeed with zero generated samples. + The Traces source consumes existing telemetry, so no `model_options` + are required — the service derives samples directly from the agent's + traces. USAGE: python sample_dataset_generation_job_traces_for_evaluation.py Before running the sample: - pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv + pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv \\ + azure-monitor-opentelemetry azure-core-tracing-opentelemetry + + (The two telemetry packages are only required for self-contained mode.) Set these environment variables with your own values: - 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found - in the overview page of your Microsoft Foundry project. - 2) FOUNDRY_AGENT_NAME - Required. The name of an agent (Foundry Agent or - OpenTelemetry-instrumented third-party agent) that has recent - conversation traces in Application Insights. - 3) DATASET_NAME - Optional. Name to assign to the generated output dataset. - Defaults to `traces-eval-sample`. The service caps the rendered output - name at 50 characters, so keep custom values short — the sample appends - a unique run id suffix. - 4) FOUNDRY_TRACES_WINDOW_DAYS - Optional. How far back, in days, to look for - agent traces. Defaults to 7. - 5) POLL_INTERVAL_SECONDS - Optional. Number of seconds to sleep between status - polls for the data generation job. Defaults to 10. + 1) FOUNDRY_PROJECT_ENDPOINT - Required. 
The Azure AI Project endpoint, as + found in the overview page of your Microsoft Foundry project. + 2) FOUNDRY_AGENT_NAME - Optional. The name of an existing agent (Foundry + Agent or OpenTelemetry-instrumented third-party agent) that already + has recent conversation traces in Application Insights. If set, the + sample skips agent creation and trace seeding and uses this agent. + 3) FOUNDRY_MODEL_NAME - Required for self-contained mode. The Azure OpenAI + deployment name used to drive the temporary agent during trace + seeding. Ignored when FOUNDRY_AGENT_NAME is set. + 4) DATASET_NAME - Optional. Name to assign to the generated output + dataset. Defaults to `traces-eval-sample`. The service caps the + rendered output name at 50 characters, so keep custom values short — + the sample appends a unique run id suffix. + 5) FOUNDRY_TRACES_WINDOW_DAYS - Optional. How far back, in days, to look + for agent traces when in BYO mode. Defaults to 7. Ignored in + self-contained mode (the sample uses an exact window covering the + seeded traces). + 6) POLL_INTERVAL_SECONDS - Optional. Number of seconds to sleep between + status polls for the data generation job. Defaults to 10. + 7) TRACE_SEEDING_CONVERSATIONS - Optional. Number of conversations to + seed in self-contained mode. Defaults to 3. + 8) TRACE_SEEDING_TURNS - Optional. Turns per seeded conversation in + self-contained mode. Defaults to 5. + 9) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after seeding + for Application Insights to ingest the emitted spans before + submitting the data generation job. Defaults to 180. 
""" +import importlib import os +import sys import time import uuid from datetime import datetime, timedelta, timezone +from typing import List, Optional from dotenv import load_dotenv @@ -61,18 +93,113 @@ DatasetDataGenerationJobOutput, DatasetVersion, JobStatus, + PromptAgentDefinition, TracesDataGenerationJobOptions, TracesDataGenerationJobSource, ) load_dotenv() +# Persona used when seeding traces in self-contained mode. Mirrors the +# Widgets & Gizmos persona from +# sample_dataset_generation_job_simpleqna_with_agent_source.py so the +# generated traces have substantive multi-turn content the data generation +# service can synthesize useful eval samples from. +AGENT_INSTRUCTIONS = """\ +You are the Widgets & Gizmos customer-support agent. Help customers with +returns, warranty claims, repairs, product specifications, compatibility, +and ordering for Widgets, Gizmos, Sprockets, and accessories. + +Use this knowledge base when answering. Cite the relevant policy or spec +directly when you can. + +Returns + * Unopened products may be returned within 30 days for a full refund. + * Opened products may be returned within 14 days for a refund minus a + 10% restocking fee. Defective products may be returned within 90 days + at no cost. + * Refunds are processed within 5-7 business days after the return is + received and inspected. + * Items lost in shipping should be reported within 21 days of the order + date; we re-ship at no cost. + +Warranty + * Standard products carry a 1-year limited warranty against + manufacturing defects. + * The Deluxe Sprocket carries a 5-year limited warranty. + * Warranty repairs are free. Customer ships the unit to us prepaid; we + cover return shipping. Typical turnaround is 10-14 business days. + +Specifications + * Standard Widget: 4 inches, blue or red, weighs 6oz, made of aluminum. + * Compact Widget: 2 inches, gray only, weighs 3oz, made of aluminum. 
+ * Gizmo: 6 inches, available in green, weighs 10oz, made of stainless + steel and ABS plastic. Compatible with all Sprocket Adapter v2 mounts. + * Sprocket Adapter v2: universal mount that fits Widgets, Gizmos, and + third-party 1/4-20 hardware. + +Pricing & bundles + * Standard Widget: $19.99 each, bundle of 10 for $149.99. + * Gizmo: $34.99 each, bundle of 5 for $129.99. + * Deluxe Sprocket: $79.99 each. + +If you do not know the answer, say so and offer to escalate. Be concise. +""" + +# Multi-turn conversation arcs used to seed traces. Each inner list is one +# conversation; the sample runs each turn against the temporary agent. +SEEDING_CONVERSATION_ARCS = [ + [ + "Hi, I need to return a defective Standard Widget.", + "I bought it 45 days ago. Is it still eligible for a refund?", + "What about a Gizmo I ordered but never received - it has been 3 weeks?", + "Can I get a refund instead of a replacement shipment?", + "How long will the refund take to show up on my card?", + ], + [ + "Does the Deluxe Sprocket come with a warranty?", + "What exactly does the warranty cover?", + "My Deluxe Sprocket stopped turning after 6 months - what should I do?", + "Do I have to pay for return shipping on a warranty claim?", + "How long do warranty repairs usually take?", + ], + [ + "What is the difference between a Standard Widget and a Compact Widget?", + "Is the Compact Widget compatible with the Sprocket Adapter v2?", + "What colors and sizes are Gizmos available in?", + "How much is a bundle of 10 Standard Widgets?", + "Do you carry any third-party accessories that fit the Sprocket Adapter v2?", + ], +] + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] -agent_name = os.environ["FOUNDRY_AGENT_NAME"] +provided_agent_name = os.environ.get("FOUNDRY_AGENT_NAME", "").strip() dataset_name = os.environ.get("DATASET_NAME", "traces-eval-sample") -traces_window_days = int(os.environ.get("FOUNDRY_TRACES_WINDOW_DAYS", "7")) poll_interval_seconds = 
int(os.environ.get("POLL_INTERVAL_SECONDS", "10")) +# Self-contained mode is enabled unless the user pointed at an existing agent. +seed_traces = not provided_agent_name + +# Window default differs by mode: in self-contained mode we compute the +# window exactly around the seeded traces (so this knob is ignored). +traces_window_days = int(os.environ.get("FOUNDRY_TRACES_WINDOW_DAYS", "7")) + +# Seeding knobs (only used when seed_traces is True). +trace_seeding_conversations = int( + os.environ.get("TRACE_SEEDING_CONVERSATIONS", str(len(SEEDING_CONVERSATION_ARCS))) +) +trace_seeding_turns = int( + os.environ.get("TRACE_SEEDING_TURNS", str(len(SEEDING_CONVERSATION_ARCS[0]))) +) +trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180")) + +if seed_traces and "FOUNDRY_MODEL_NAME" not in os.environ: + raise EnvironmentError( + "Self-contained mode requires FOUNDRY_MODEL_NAME (the Azure OpenAI deployment " + "name used to drive the temporary agent). Either set FOUNDRY_MODEL_NAME or set " + "FOUNDRY_AGENT_NAME to use an existing agent with traces." + ) + # Unique per-run output dataset name so repeated runs do not collide. # Output names are capped at 50 characters by the service. run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" @@ -83,78 +210,268 @@ f"Lower DATASET_NAME (currently `{dataset_name}`) so that `-` fits within 50 characters." ) -# Trace look-back window: now - `traces_window_days` ... now. -end_time = datetime.now(tz=timezone.utc) -start_time = end_time - timedelta(days=traces_window_days) +# Agent name used to read traces. In self-contained mode we use a unique +# per-run name so concurrent runs do not collide and so we know any matched +# traces belong to this run. 
+agent_name = provided_agent_name or f"traces-eval-sample-{run_id}" TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED} + +def _safe_console(text: str) -> str: + """Encode `text` so it always prints on the active stdout encoding. + + Some Windows consoles default to cp1252, which cannot encode characters + the model may emit (e.g. smart quotes, non-breaking hyphens). We replace + any unencodable code points with `?` so a preview line never crashes the + sample. + """ + encoding = getattr(sys.stdout, "encoding", None) or "utf-8" + return text.encode(encoding, errors="replace").decode(encoding, errors="replace") + + +def _seed_agent_traces( + project_client: AIProjectClient, + agent_name_to_use: str, + agent_id_to_use: str, + conversation_count: int, + turns_per_conversation: int, + conversation_ids: List[str], +) -> None: + """Run a few conversations against the agent so GenAI spans flow to App Insights. + + Created conversation IDs are appended to `conversation_ids` as each + conversation is created, so the caller can clean them up even if seeding + raises mid-way through. + """ + arcs = SEEDING_CONVERSATION_ARCS + with project_client.get_openai_client() as openai_client: + for ci in range(conversation_count): + arc = arcs[ci % len(arcs)] + conversation = openai_client.conversations.create() + conversation_ids.append(conversation.id) + print(f" - conversation {ci + 1}/{conversation_count} (id: {conversation.id})") + for ti in range(turns_per_conversation): + prompt = arc[ti % len(arc)] + response = openai_client.responses.create( + conversation=conversation.id, + input=prompt, + extra_body={ + "agent_reference": { + "name": agent_name_to_use, + "id": agent_id_to_use, + "type": "agent_reference", + } + }, + ) + preview = (response.output_text or "").replace("\n", " ") + if len(preview) > 80: + preview = preview[:77] + "..." 
+ print(_safe_console(f" turn {ti + 1}: {prompt}")) + print(_safe_console(f" response: {preview}")) + + +mode_label = ( + "self-contained (will create a temporary agent and seed traces)" + if seed_traces + else f"bring-your-own-agent (`{provided_agent_name}`)" +) +print(f"Mode: {mode_label}.") + with ( DefaultAzureCredential() as credential, AIProjectClient(endpoint=endpoint, credential=credential) as project_client, ): - # ------------------------------------------------------------------ - # 1. Submit a data generation job that reads agent traces. - # ------------------------------------------------------------------ - print(f"Create a data generation job from traces for agent `{agent_name}` (window: {traces_window_days} day(s)).") - job = DataGenerationJob( - inputs=DataGenerationJobInputs( - name=f"traces-eval-{run_id}", - scenario=DataGenerationJobScenario.EVALUATION, - sources=[ - TracesDataGenerationJobSource( - description="Application Insights conversation traces for the Foundry agent.", - agent_name=agent_name, - start_time=start_time, - end_time=end_time, + created_agent = None + conversation_ids: List[str] = [] + seed_start: Optional[datetime] = None + + try: + if seed_traces: + # -------------------------------------------------------------- + # 0a. Wire up Azure Monitor + GenAI instrumentation so calls to + # responses.create emit semantic GenAI spans (with message + # content) to Application Insights. + # -------------------------------------------------------------- + try: + configure_azure_monitor = importlib.import_module( + "azure.monitor.opentelemetry" + ).configure_azure_monitor + AIProjectInstrumentor = importlib.import_module( + "azure.ai.projects.telemetry" + ).AIProjectInstrumentor + except ImportError as exc: + raise ImportError( + "Self-contained mode requires the `azure-monitor-opentelemetry` and " + "`azure-core-tracing-opentelemetry` packages. 
Install them with " + "`pip install azure-monitor-opentelemetry azure-core-tracing-opentelemetry` " + "or set FOUNDRY_AGENT_NAME to use an existing agent with traces." + ) from exc + + # AIProjectInstrumentor requires this env var be set BEFORE + # instrument() is called. We force it on (not setdefault) so the + # temporary agent's calls always produce GenAI spans the data-gen + # service can read. + os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true" + + print("Fetch Application Insights connection string and configure Azure Monitor exporter.") + connection_string = project_client.telemetry.get_application_insights_connection_string() + configure_azure_monitor(connection_string=connection_string) + AIProjectInstrumentor().instrument(enable_content_recording=True) + + # -------------------------------------------------------------- + # 0b. Create the temporary agent. + # -------------------------------------------------------------- + model_deployment = os.environ["FOUNDRY_MODEL_NAME"] + print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).") + created_agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition( + model=model_deployment, + instructions=AGENT_INSTRUCTIONS, + ), + ) + print( + f"Agent created (id: {created_agent.id}, name: {created_agent.name}, " + f"version: {created_agent.version})." + ) + + # -------------------------------------------------------------- + # 0c. Seed traces by running a few conversations against the agent. + # -------------------------------------------------------------- + seed_start = datetime.now(tz=timezone.utc) + print( + f"Seed {trace_seeding_conversations} conversation(s) x " + f"{trace_seeding_turns} turn(s) against the agent so spans flow to Application Insights." 
+ ) + _seed_agent_traces( + project_client=project_client, + agent_name_to_use=created_agent.name, + agent_id_to_use=created_agent.id, + conversation_count=trace_seeding_conversations, + turns_per_conversation=trace_seeding_turns, + conversation_ids=conversation_ids, + ) + + # Flush any buffered spans so the only delay we wait for below is + # ingestion delay, not exporter batching delay. + try: + from opentelemetry import trace as _otel_trace # pylint: disable=import-outside-toplevel + + tracer_provider = _otel_trace.get_tracer_provider() + force_flush = getattr(tracer_provider, "force_flush", None) + if callable(force_flush): + force_flush() + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not force-flush tracer provider: {exc}") + + print( + f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the " + f"emitted spans. Override with TRACE_INGESTION_WAIT_SECONDS.", + flush=True, + ) + time.sleep(trace_ingestion_wait_seconds) + + # ------------------------------------------------------------------ + # 1. Submit a data generation job that reads agent traces. + # ------------------------------------------------------------------ + if seed_traces and seed_start is not None: + # Window covers a small backoff before seeding through "now", which + # guarantees the seeded spans fall inside the queried window. + start_time = seed_start - timedelta(minutes=5) + end_time = datetime.now(tz=timezone.utc) + else: + # BYO mode: use the user-configurable look-back window. + end_time = datetime.now(tz=timezone.utc) + start_time = end_time - timedelta(days=traces_window_days) + + print( + f"Create a data generation job from traces for agent `{agent_name}` " + f"(window: {start_time.isoformat()} .. {end_time.isoformat()})." 
+ ) + job = DataGenerationJob( + inputs=DataGenerationJobInputs( + name=f"traces-eval-{run_id}", + scenario=DataGenerationJobScenario.EVALUATION, + sources=[ + TracesDataGenerationJobSource( + description="Application Insights conversation traces for the Foundry agent.", + agent_name=agent_name, + start_time=start_time, + end_time=end_time, + ), + ], + options=TracesDataGenerationJobOptions( + # Service requires max_samples to be between 15 and 1000. + max_samples=15, ), - ], - options=TracesDataGenerationJobOptions( - # Service requires max_samples to be between 15 and 1000. - max_samples=15, + output_options=DataGenerationJobOutputOptions(name=output_dataset_name), ), - output_options=DataGenerationJobOutputOptions(name=output_dataset_name), - ), - ) - job = project_client.beta.datasets.create_generation_job(job=job) - print(f"Created data generation job `{job.id}` (status: `{job.status}`).") - - print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True) - while True: - job = project_client.beta.datasets.get_generation_job(job_id=job.id) - if job.status in TERMINAL_STATUSES: - break - time.sleep(poll_interval_seconds) - print(".", end="", flush=True) - print() - print(f"Final job status: `{job.status}`.") - - if job.status != JobStatus.SUCCEEDED: - message = job.error.message if job.error is not None else "" - raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}") - - # Locate the Dataset output produced by the job. 
- output_name: str = "" - output_version: str = "" - for output in (job.result.outputs if job.result is not None else None) or []: - if isinstance(output, DatasetDataGenerationJobOutput): - output_name = output.name or "" - output_version = output.version or "" - break - if not output_name or not output_version: - raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.") - - dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version) - print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`") - if job.result is not None and job.result.generated_samples is not None: - print(f"Generated samples: {job.result.generated_samples}") - - # ------------------------------------------------------------------ - # 2. Clean up. - # ------------------------------------------------------------------ - print(f"Delete the generated dataset `{dataset.name}` v{dataset.version}.") - project_client.datasets.delete(name=dataset.name or "", version=dataset.version or "") - - print(f"Delete the data generation job `{job.id}`.") - project_client.beta.datasets.delete_generation_job(job_id=job.id) + ) + job = project_client.beta.datasets.create_generation_job(job=job) + print(f"Created data generation job `{job.id}` (status: `{job.status}`).") + + print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True) + while True: + job = project_client.beta.datasets.get_generation_job(job_id=job.id) + if job.status in TERMINAL_STATUSES: + break + time.sleep(poll_interval_seconds) + print(".", end="", flush=True) + print() + print(f"Final job status: `{job.status}`.") + + if job.status != JobStatus.SUCCEEDED: + message = job.error.message if job.error is not None else "" + raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}") + + # Locate the Dataset output produced by the job. 
+ output_name: str = "" + output_version: str = "" + for output in (job.result.outputs if job.result is not None else None) or []: + if isinstance(output, DatasetDataGenerationJobOutput): + output_name = output.name or "" + output_version = output.version or "" + break + if not output_name or not output_version: + raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.") + + dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version) + print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`") + if job.result is not None and job.result.generated_samples is not None: + print(f"Generated samples: {job.result.generated_samples}") + + # ------------------------------------------------------------------ + # 2. Clean up dataset + job. + # ------------------------------------------------------------------ + print(f"Delete the generated dataset `{dataset.name}` v{dataset.version}.") + project_client.datasets.delete(name=dataset.name or "", version=dataset.version or "") + + print(f"Delete the data generation job `{job.id}`.") + project_client.beta.datasets.delete_generation_job(job_id=job.id) + + finally: + # Best-effort cleanup of the temporary agent and seeded conversations. + # Wrap each step so a failure in one does not skip the others, and so + # cleanup never masks the real exception that brought us here. 
+ if conversation_ids: + try: + with project_client.get_openai_client() as openai_client: + for cid in conversation_ids: + try: + openai_client.conversations.delete(conversation_id=cid) + print(f"Deleted seeded conversation `{cid}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete conversation `{cid}`: {exc}") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not open OpenAI client for conversation cleanup: {exc}") + + if created_agent is not None: + try: + project_client.agents.delete_version( + agent_name=created_agent.name, agent_version=created_agent.version + ) + print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete temporary agent `{created_agent.name}`: {exc}") From 6819189524143da6c9265437bfb7031e8f595003 Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 21:09:51 -0700 Subject: [PATCH 2/8] [ai/azure-ai-projects] move dataset+job cleanup into finally (review) Track submitted_job_id and created_dataset before the try block and move the dataset and job deletes into finally, alongside the existing conversation and agent cleanup. Previously these two deletes lived inside try, so a polling failure, dataset-get failure, or any exception between job creation and the success-path deletes would leak the data-generation job (and possibly the dataset) on the unhappy path. Each step is now wrapped in its own best-effort try/except so a failure in one does not skip the others, and so cleanup never masks the real exception. Live-tested happy path against build26-bug-bash on gpt-5.1: 15 generated samples, all 5 resources cleaned up via finally, exit 0. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index e7c60468b4c7..1374f29dd62d 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -286,6 +286,8 @@ def _seed_agent_traces( created_agent = None conversation_ids: List[str] = [] seed_start: Optional[datetime] = None + submitted_job_id: Optional[str] = None + created_dataset: Optional[DatasetVersion] = None try: if seed_traces: @@ -410,6 +412,7 @@ def _seed_agent_traces( ), ) job = project_client.beta.datasets.create_generation_job(job=job) + submitted_job_id = job.id print(f"Created data generation job `{job.id}` (status: `{job.status}`).") print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True) @@ -438,23 +441,39 @@ def _seed_agent_traces( raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.") dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version) + created_dataset = dataset print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`") if job.result is not None and job.result.generated_samples is not None: print(f"Generated samples: {job.result.generated_samples}") - # ------------------------------------------------------------------ - # 2. Clean up dataset + job. 
- # ------------------------------------------------------------------ - print(f"Delete the generated dataset `{dataset.name}` v{dataset.version}.") - project_client.datasets.delete(name=dataset.name or "", version=dataset.version or "") + finally: + # Best-effort cleanup. Each step is wrapped in its own try/except so a + # failure in one does not skip the others, and so cleanup never masks + # the real exception that brought us here. Order is outputs -> producers: + # dataset -> job -> seeded conversations -> temporary agent. + if created_dataset is not None: + try: + print( + f"Delete the generated dataset `{created_dataset.name}` v{created_dataset.version}." + ) + project_client.datasets.delete( + name=created_dataset.name or "", version=created_dataset.version or "" + ) + except Exception as exc: # pylint: disable=broad-exception-caught + print( + f" (warning) could not delete generated dataset " + f"`{created_dataset.name}` v{created_dataset.version}: {exc}" + ) - print(f"Delete the data generation job `{job.id}`.") - project_client.beta.datasets.delete_generation_job(job_id=job.id) + if submitted_job_id is not None: + try: + print(f"Delete the data generation job `{submitted_job_id}`.") + project_client.beta.datasets.delete_generation_job(job_id=submitted_job_id) + except Exception as exc: # pylint: disable=broad-exception-caught + print( + f" (warning) could not delete data generation job `{submitted_job_id}`: {exc}" + ) - finally: - # Best-effort cleanup of the temporary agent and seeded conversations. - # Wrap each step so a failure in one does not skip the others, and so - # cleanup never masks the real exception that brought us here. 
if conversation_ids: try: with project_client.get_openai_client() as openai_client: From ec425c480c43575aaf7cfd6c83258eda2f17b40f Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 21:39:18 -0700 Subject: [PATCH 3/8] [ai/azure-ai-projects] simplify traces-for-evaluation sample (review) Drops the bring-your-own-agent mode and trims supporting plumbing so the sample is easier for a first-time reader to follow. Was 439 lines; now 282 (sister sample sample_dataset_generation_job_simpleqna_with_agent_source.py is 180). Changes: - Self-contained mode only. The if seed_traces / else split, the BYO env vars (FOUNDRY_AGENT_NAME, FOUNDRY_TRACES_WINDOW_DAYS, TRACE_SEEDING_CONVERSATIONS, TRACE_SEEDING_TURNS), and the importlib.import_module dance for optional telemetry deps are gone. A 4-line note in the docstring tells BYO users which block to replace. - Imports azure.monitor.opentelemetry and azure.ai.projects.telemetry directly at the top of the file. - Shrinks AGENT_INSTRUCTIONS from ~40 lines to ~15 with only the policies the seeded prompts actually ask about. - Drops the _safe_console helper and the per-turn preview prints. Cleanup output is still printed. - Drops the opentelemetry force_flush try/except; the 180s ingestion wait covers exporter batching too. - Replaces the four per-resource try/except cleanup blocks with a small _try_delete(label, fn, *args, **kwargs) helper. Live-tested happy path against build26-bug-bash on gpt-5.1: 15 generated samples, all 6 resources cleaned up via finally, exit 0. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 552 ++++++------------ 1 file changed, 191 insertions(+), 361 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index 1374f29dd62d..8b5f06fd59b0 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -6,33 +6,26 @@ """ DESCRIPTION: - Generates an evaluation dataset from an agent's recent conversation - traces. The sample runs in one of two modes: - - * Self-contained mode (default): Creates a temporary Foundry agent, - runs a few sample conversations against it with GenAI content - tracing enabled so spans flow to Application Insights, waits for - ingestion, then runs the data generation job. The temporary agent - and conversations are deleted at the end. Use this mode to try the - sample without preparing anything in advance. - * Bring-your-own-agent mode (BYO): Set FOUNDRY_AGENT_NAME to point at - an existing agent that already has recent conversation traces. The - sample skips agent creation and trace seeding and uses your agent - as-is. - - In both modes, the sample: - 1. Creates a `DataGenerationJob` (scenario=EVALUATION, type=traces) - that reads spans from Application Insights for the agent within a - time window and synthesizes question / answer pairs into a new - versioned Dataset. - 2. Polls the job to completion and resolves the resulting - `DatasetVersion`. - 3. Cleans up the generated dataset, the data generation job, and - (in self-contained mode) the temporary agent and conversations. 
- - The Traces source consumes existing telemetry, so no `model_options` - are required — the service derives samples directly from the agent's - traces. + Generates an evaluation dataset from an agent's conversation traces. + The sample is fully self-contained: + + 1. Wires up Azure Monitor + the AIProjectInstrumentor so the temporary + agent's calls emit semantic GenAI spans (with message content) to + Application Insights. + 2. Creates a temporary Foundry agent and runs a few sample + conversations against it so spans flow to Application Insights. + 3. Waits for ingestion, then submits a `DataGenerationJob` + (scenario=EVALUATION, source=traces) that synthesizes question/ + answer pairs from those spans. + 4. Polls the job, fetches the resulting `DatasetVersion`, and prints + the count of generated samples. + 5. Cleans up the dataset, job, seeded conversations, and the + temporary agent. + + To run against an existing agent that already has recent traces in + Application Insights, replace the seeding block (step 2) with your + agent's name and skip the ingestion wait. The data-generation API call + (step 3) is the same. USAGE: python sample_dataset_generation_job_traces_for_evaluation.py @@ -42,44 +35,27 @@ pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv \\ azure-monitor-opentelemetry azure-core-tracing-opentelemetry - (The two telemetry packages are only required for self-contained mode.) - Set these environment variables with your own values: 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your Microsoft Foundry project. - 2) FOUNDRY_AGENT_NAME - Optional. The name of an existing agent (Foundry - Agent or OpenTelemetry-instrumented third-party agent) that already - has recent conversation traces in Application Insights. If set, the - sample skips agent creation and trace seeding and uses this agent. - 3) FOUNDRY_MODEL_NAME - Required for self-contained mode. 
The Azure OpenAI - deployment name used to drive the temporary agent during trace - seeding. Ignored when FOUNDRY_AGENT_NAME is set. - 4) DATASET_NAME - Optional. Name to assign to the generated output + 2) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name used + to drive the temporary agent during trace seeding. + 3) DATASET_NAME - Optional. Name to assign to the generated output dataset. Defaults to `traces-eval-sample`. The service caps the - rendered output name at 50 characters, so keep custom values short — + rendered output name at 50 characters, so keep custom values short - the sample appends a unique run id suffix. - 5) FOUNDRY_TRACES_WINDOW_DAYS - Optional. How far back, in days, to look - for agent traces when in BYO mode. Defaults to 7. Ignored in - self-contained mode (the sample uses an exact window covering the - seeded traces). - 6) POLL_INTERVAL_SECONDS - Optional. Number of seconds to sleep between - status polls for the data generation job. Defaults to 10. - 7) TRACE_SEEDING_CONVERSATIONS - Optional. Number of conversations to - seed in self-contained mode. Defaults to 3. - 8) TRACE_SEEDING_TURNS - Optional. Turns per seeded conversation in - self-contained mode. Defaults to 5. - 9) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after seeding - for Application Insights to ingest the emitted spans before + 4) POLL_INTERVAL_SECONDS - Optional. Seconds to sleep between status + polls for the data generation job. Defaults to 10. + 5) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after + seeding for Application Insights to ingest the emitted spans before submitting the data generation job. Defaults to 180. 
""" -import importlib import os -import sys import time import uuid from datetime import datetime, timedelta, timezone -from typing import List, Optional +from typing import Callable, List, Optional from dotenv import load_dotenv @@ -97,186 +73,89 @@ TracesDataGenerationJobOptions, TracesDataGenerationJobSource, ) +from azure.ai.projects.telemetry import AIProjectInstrumentor +from azure.monitor.opentelemetry import configure_azure_monitor load_dotenv() -# Persona used when seeding traces in self-contained mode. Mirrors the -# Widgets & Gizmos persona from -# sample_dataset_generation_job_simpleqna_with_agent_source.py so the -# generated traces have substantive multi-turn content the data generation -# service can synthesize useful eval samples from. + +# Short persona used to make seeded traces look like real customer-support +# conversations. The data-gen service synthesizes eval samples from these +# traces, so the persona just needs enough domain detail to answer the +# seeding prompts confidently. AGENT_INSTRUCTIONS = """\ -You are the Widgets & Gizmos customer-support agent. Help customers with -returns, warranty claims, repairs, product specifications, compatibility, -and ordering for Widgets, Gizmos, Sprockets, and accessories. - -Use this knowledge base when answering. Cite the relevant policy or spec -directly when you can. - -Returns - * Unopened products may be returned within 30 days for a full refund. - * Opened products may be returned within 14 days for a refund minus a - 10% restocking fee. Defective products may be returned within 90 days - at no cost. - * Refunds are processed within 5-7 business days after the return is - received and inspected. - * Items lost in shipping should be reported within 21 days of the order - date; we re-ship at no cost. - -Warranty - * Standard products carry a 1-year limited warranty against - manufacturing defects. - * The Deluxe Sprocket carries a 5-year limited warranty. - * Warranty repairs are free. 
Customer ships the unit to us prepaid; we - cover return shipping. Typical turnaround is 10-14 business days. - -Specifications - * Standard Widget: 4 inches, blue or red, weighs 6oz, made of aluminum. - * Compact Widget: 2 inches, gray only, weighs 3oz, made of aluminum. - * Gizmo: 6 inches, available in green, weighs 10oz, made of stainless - steel and ABS plastic. Compatible with all Sprocket Adapter v2 mounts. - * Sprocket Adapter v2: universal mount that fits Widgets, Gizmos, and - third-party 1/4-20 hardware. - -Pricing & bundles - * Standard Widget: $19.99 each, bundle of 10 for $149.99. - * Gizmo: $34.99 each, bundle of 5 for $129.99. - * Deluxe Sprocket: $79.99 each. - -If you do not know the answer, say so and offer to escalate. Be concise. +You are the Widgets & Gizmos customer-support agent. + +Returns: Unopened products may be returned within 30 days for a full refund. +Defective products may be returned within 90 days at no cost. Refunds take +5-7 business days. + +Warranty: Standard products carry a 1-year limited warranty. The Deluxe +Sprocket carries a 5-year warranty. Warranty repairs are free; we cover +return shipping. Repairs take 10-14 business days. + +Products: Standard Widget is $19.99 (bundle of 10 for $149.99). Deluxe +Sprocket is $79.99. + +If you do not know the answer, say so. Be concise. """ -# Multi-turn conversation arcs used to seed traces. Each inner list is one -# conversation; the sample runs each turn against the temporary agent. -SEEDING_CONVERSATION_ARCS = [ + +SEEDING_CONVERSATIONS: List[List[str]] = [ [ - "Hi, I need to return a defective Standard Widget.", - "I bought it 45 days ago. 
Is it still eligible for a refund?", - "What about a Gizmo I ordered but never received - it has been 3 weeks?", - "Can I get a refund instead of a replacement shipment?", - "How long will the refund take to show up on my card?", + "Can I return a defective Standard Widget after 45 days?", + "How long does a refund take?", + "What about an unopened Standard Widget?", + "Do I pay return shipping?", + "Is there a restocking fee?", ], [ - "Does the Deluxe Sprocket come with a warranty?", - "What exactly does the warranty cover?", - "My Deluxe Sprocket stopped turning after 6 months - what should I do?", - "Do I have to pay for return shipping on a warranty claim?", - "How long do warranty repairs usually take?", + "What is the warranty on the Deluxe Sprocket?", + "What does the warranty cover?", + "Do warranty repairs cost anything?", + "How long do warranty repairs take?", + "Who pays return shipping for a warranty claim?", ], [ - "What is the difference between a Standard Widget and a Compact Widget?", - "Is the Compact Widget compatible with the Sprocket Adapter v2?", - "What colors and sizes are Gizmos available in?", - "How much is a bundle of 10 Standard Widgets?", - "Do you carry any third-party accessories that fit the Sprocket Adapter v2?", + "How much is a Standard Widget?", + "Is there a bundle deal?", + "What is the Deluxe Sprocket price?", + "What products do you carry?", + "Do you sell accessories?", ], ] + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] -provided_agent_name = os.environ.get("FOUNDRY_AGENT_NAME", "").strip() +model_deployment = os.environ["FOUNDRY_MODEL_NAME"] dataset_name = os.environ.get("DATASET_NAME", "traces-eval-sample") poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10")) - -# Self-contained mode is enabled unless the user pointed at an existing agent. 
-seed_traces = not provided_agent_name - -# Window default differs by mode: in self-contained mode we compute the -# window exactly around the seeded traces (so this knob is ignored). -traces_window_days = int(os.environ.get("FOUNDRY_TRACES_WINDOW_DAYS", "7")) - -# Seeding knobs (only used when seed_traces is True). -trace_seeding_conversations = int( - os.environ.get("TRACE_SEEDING_CONVERSATIONS", str(len(SEEDING_CONVERSATION_ARCS))) -) -trace_seeding_turns = int( - os.environ.get("TRACE_SEEDING_TURNS", str(len(SEEDING_CONVERSATION_ARCS[0]))) -) trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180")) -if seed_traces and "FOUNDRY_MODEL_NAME" not in os.environ: - raise EnvironmentError( - "Self-contained mode requires FOUNDRY_MODEL_NAME (the Azure OpenAI deployment " - "name used to drive the temporary agent). Either set FOUNDRY_MODEL_NAME or set " - "FOUNDRY_AGENT_NAME to use an existing agent with traces." - ) - -# Unique per-run output dataset name so repeated runs do not collide. -# Output names are capped at 50 characters by the service. +# Unique per-run id used for the output dataset name and the temporary +# agent name so repeated runs do not collide and so any matched traces +# clearly belong to this run. Output names are capped at 50 chars. run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" output_dataset_name = f"{dataset_name}-{run_id}" if len(output_dataset_name) > 50: raise ValueError( f"Output dataset name `{output_dataset_name}` exceeds the 50-character service limit. " - f"Lower DATASET_NAME (currently `{dataset_name}`) so that `-` fits within 50 characters." + f"Shorten DATASET_NAME (currently `{dataset_name}`) so that `-` fits within 50 characters." ) -# Agent name used to read traces. In self-contained mode we use a unique -# per-run name so concurrent runs do not collide and so we know any matched -# traces belong to this run. 
-agent_name = provided_agent_name or f"traces-eval-sample-{run_id}" +agent_name = f"traces-eval-sample-{run_id}" TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED} -def _safe_console(text: str) -> str: - """Encode `text` so it always prints on the active stdout encoding. - - Some Windows consoles default to cp1252, which cannot encode characters - the model may emit (e.g. smart quotes, non-breaking hyphens). We replace - any unencodable code points with `?` so a preview line never crashes the - sample. - """ - encoding = getattr(sys.stdout, "encoding", None) or "utf-8" - return text.encode(encoding, errors="replace").decode(encoding, errors="replace") - - -def _seed_agent_traces( - project_client: AIProjectClient, - agent_name_to_use: str, - agent_id_to_use: str, - conversation_count: int, - turns_per_conversation: int, - conversation_ids: List[str], -) -> None: - """Run a few conversations against the agent so GenAI spans flow to App Insights. - - Created conversation IDs are appended to `conversation_ids` as each - conversation is created, so the caller can clean them up even if seeding - raises mid-way through. - """ - arcs = SEEDING_CONVERSATION_ARCS - with project_client.get_openai_client() as openai_client: - for ci in range(conversation_count): - arc = arcs[ci % len(arcs)] - conversation = openai_client.conversations.create() - conversation_ids.append(conversation.id) - print(f" - conversation {ci + 1}/{conversation_count} (id: {conversation.id})") - for ti in range(turns_per_conversation): - prompt = arc[ti % len(arc)] - response = openai_client.responses.create( - conversation=conversation.id, - input=prompt, - extra_body={ - "agent_reference": { - "name": agent_name_to_use, - "id": agent_id_to_use, - "type": "agent_reference", - } - }, - ) - preview = (response.output_text or "").replace("\n", " ") - if len(preview) > 80: - preview = preview[:77] + "..." 
- print(_safe_console(f" turn {ti + 1}: {prompt}")) - print(_safe_console(f" response: {preview}")) - - -mode_label = ( - "self-contained (will create a temporary agent and seed traces)" - if seed_traces - else f"bring-your-own-agent (`{provided_agent_name}`)" -) -print(f"Mode: {mode_label}.") +def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: object) -> None: + """Best-effort delete; logs and swallows failures so later cleanup steps still run.""" + try: + fn(*args, **kwargs) + print(f"Deleted {label}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete {label}: {exc}") + with ( DefaultAzureCredential() as credential, @@ -285,143 +164,103 @@ def _seed_agent_traces( created_agent = None conversation_ids: List[str] = [] - seed_start: Optional[datetime] = None submitted_job_id: Optional[str] = None created_dataset: Optional[DatasetVersion] = None try: - if seed_traces: - # -------------------------------------------------------------- - # 0a. Wire up Azure Monitor + GenAI instrumentation so calls to - # responses.create emit semantic GenAI spans (with message - # content) to Application Insights. - # -------------------------------------------------------------- - try: - configure_azure_monitor = importlib.import_module( - "azure.monitor.opentelemetry" - ).configure_azure_monitor - AIProjectInstrumentor = importlib.import_module( - "azure.ai.projects.telemetry" - ).AIProjectInstrumentor - except ImportError as exc: - raise ImportError( - "Self-contained mode requires the `azure-monitor-opentelemetry` and " - "`azure-core-tracing-opentelemetry` packages. Install them with " - "`pip install azure-monitor-opentelemetry azure-core-tracing-opentelemetry` " - "or set FOUNDRY_AGENT_NAME to use an existing agent with traces." - ) from exc - - # AIProjectInstrumentor requires this env var be set BEFORE - # instrument() is called. 
We force it on (not setdefault) so the - # temporary agent's calls always produce GenAI spans the data-gen - # service can read. - os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true" - - print("Fetch Application Insights connection string and configure Azure Monitor exporter.") - connection_string = project_client.telemetry.get_application_insights_connection_string() - configure_azure_monitor(connection_string=connection_string) - AIProjectInstrumentor().instrument(enable_content_recording=True) - - # -------------------------------------------------------------- - # 0b. Create the temporary agent. - # -------------------------------------------------------------- - model_deployment = os.environ["FOUNDRY_MODEL_NAME"] - print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).") - created_agent = project_client.agents.create_version( - agent_name=agent_name, - definition=PromptAgentDefinition( - model=model_deployment, - instructions=AGENT_INSTRUCTIONS, - ), - ) - print( - f"Agent created (id: {created_agent.id}, name: {created_agent.name}, " - f"version: {created_agent.version})." - ) + # ------------------------------------------------------------------ + # 1. Configure Azure Monitor + GenAI instrumentation so the + # temporary agent's calls emit semantic GenAI spans (with + # message content) to Application Insights. + # ------------------------------------------------------------------ + # AIProjectInstrumentor reads this env var at instrument() time. + os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true" - # -------------------------------------------------------------- - # 0c. Seed traces by running a few conversations against the agent. - # -------------------------------------------------------------- - seed_start = datetime.now(tz=timezone.utc) - print( - f"Seed {trace_seeding_conversations} conversation(s) x " - f"{trace_seeding_turns} turn(s) against the agent so spans flow to Application Insights." 
- ) - _seed_agent_traces( - project_client=project_client, - agent_name_to_use=created_agent.name, - agent_id_to_use=created_agent.id, - conversation_count=trace_seeding_conversations, - turns_per_conversation=trace_seeding_turns, - conversation_ids=conversation_ids, - ) + print("Configure Azure Monitor exporter from the project's Application Insights connection.") + connection_string = project_client.telemetry.get_application_insights_connection_string() + configure_azure_monitor(connection_string=connection_string) + AIProjectInstrumentor().instrument(enable_content_recording=True) - # Flush any buffered spans so the only delay we wait for below is - # ingestion delay, not exporter batching delay. - try: - from opentelemetry import trace as _otel_trace # pylint: disable=import-outside-toplevel + # ------------------------------------------------------------------ + # 2. Create a temporary agent and seed traces by running a few + # conversations against it. + # ------------------------------------------------------------------ + print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).") + created_agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition(model=model_deployment, instructions=AGENT_INSTRUCTIONS), + ) + print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") - tracer_provider = _otel_trace.get_tracer_provider() - force_flush = getattr(tracer_provider, "force_flush", None) - if callable(force_flush): - force_flush() - except Exception as exc: # pylint: disable=broad-exception-caught - print(f" (warning) could not force-flush tracer provider: {exc}") + seed_start = datetime.now(tz=timezone.utc) + print( + f"Seed {len(SEEDING_CONVERSATIONS)} conversation(s) x " + f"{len(SEEDING_CONVERSATIONS[0])} turn(s) against the agent." 
+ ) + with project_client.get_openai_client() as openai_client: + for ci, arc in enumerate(SEEDING_CONVERSATIONS, start=1): + conversation = openai_client.conversations.create() + conversation_ids.append(conversation.id) + print(f" - conversation {ci}/{len(SEEDING_CONVERSATIONS)} (id: {conversation.id})") + for prompt in arc: + openai_client.responses.create( + conversation=conversation.id, + input=prompt, + extra_body={ + "agent_reference": { + "name": created_agent.name, + "id": created_agent.id, + "type": "agent_reference", + } + }, + ) - print( - f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the " - f"emitted spans. Override with TRACE_INGESTION_WAIT_SECONDS.", - flush=True, - ) - time.sleep(trace_ingestion_wait_seconds) + print( + f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the emitted spans. " + f"Override with TRACE_INGESTION_WAIT_SECONDS.", + flush=True, + ) + time.sleep(trace_ingestion_wait_seconds) # ------------------------------------------------------------------ - # 1. Submit a data generation job that reads agent traces. + # 3. Submit a data generation job that reads the agent's traces. # ------------------------------------------------------------------ - if seed_traces and seed_start is not None: - # Window covers a small backoff before seeding through "now", which - # guarantees the seeded spans fall inside the queried window. - start_time = seed_start - timedelta(minutes=5) - end_time = datetime.now(tz=timezone.utc) - else: - # BYO mode: use the user-configurable look-back window. - end_time = datetime.now(tz=timezone.utc) - start_time = end_time - timedelta(days=traces_window_days) + # Cover a small backoff before seeding through "now" so the seeded + # spans definitely fall inside the queried window. 
+ start_time = seed_start - timedelta(minutes=5) + end_time = datetime.now(tz=timezone.utc) print( f"Create a data generation job from traces for agent `{agent_name}` " f"(window: {start_time.isoformat()} .. {end_time.isoformat()})." ) - job = DataGenerationJob( - inputs=DataGenerationJobInputs( - name=f"traces-eval-{run_id}", - scenario=DataGenerationJobScenario.EVALUATION, - sources=[ - TracesDataGenerationJobSource( - description="Application Insights conversation traces for the Foundry agent.", - agent_name=agent_name, - start_time=start_time, - end_time=end_time, - ), - ], - options=TracesDataGenerationJobOptions( + job = project_client.beta.datasets.create_generation_job( + job=DataGenerationJob( + inputs=DataGenerationJobInputs( + name=f"traces-eval-{run_id}", + scenario=DataGenerationJobScenario.EVALUATION, + sources=[ + TracesDataGenerationJobSource( + description="Application Insights conversation traces for the temporary agent.", + agent_name=agent_name, + start_time=start_time, + end_time=end_time, + ), + ], # Service requires max_samples to be between 15 and 1000. 
- max_samples=15, + options=TracesDataGenerationJobOptions(max_samples=15), + output_options=DataGenerationJobOutputOptions(name=output_dataset_name), ), - output_options=DataGenerationJobOutputOptions(name=output_dataset_name), ), ) - job = project_client.beta.datasets.create_generation_job(job=job) submitted_job_id = job.id print(f"Created data generation job `{job.id}` (status: `{job.status}`).") print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True) - while True: - job = project_client.beta.datasets.get_generation_job(job_id=job.id) - if job.status in TERMINAL_STATUSES: - break + while job.status not in TERMINAL_STATUSES: time.sleep(poll_interval_seconds) print(".", end="", flush=True) + job = project_client.beta.datasets.get_generation_job(job_id=job.id) print() print(f"Final job status: `{job.status}`.") @@ -429,68 +268,59 @@ def _seed_agent_traces( message = job.error.message if job.error is not None else "" raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}") - # Locate the Dataset output produced by the job. - output_name: str = "" - output_version: str = "" - for output in (job.result.outputs if job.result is not None else None) or []: - if isinstance(output, DatasetDataGenerationJobOutput): - output_name = output.name or "" - output_version = output.version or "" - break - if not output_name or not output_version: + # ------------------------------------------------------------------ + # 4. Resolve the generated dataset. 
+ # ------------------------------------------------------------------ + outputs = (job.result.outputs if job.result is not None else None) or [] + dataset_output = next( + (o for o in outputs if isinstance(o, DatasetDataGenerationJobOutput)), None + ) + if dataset_output is None or not dataset_output.name or not dataset_output.version: raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.") - dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version) - created_dataset = dataset - print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`") + created_dataset = project_client.datasets.get( + name=dataset_output.name, version=dataset_output.version + ) + print( + f"Generated dataset: name=`{created_dataset.name}` " + f"version=`{created_dataset.version}` id=`{created_dataset.id}`" + ) if job.result is not None and job.result.generated_samples is not None: print(f"Generated samples: {job.result.generated_samples}") finally: - # Best-effort cleanup. Each step is wrapped in its own try/except so a - # failure in one does not skip the others, and so cleanup never masks - # the real exception that brought us here. Order is outputs -> producers: - # dataset -> job -> seeded conversations -> temporary agent. + # Best-effort cleanup, outputs -> producers (dataset, job, conversations, agent). if created_dataset is not None: - try: - print( - f"Delete the generated dataset `{created_dataset.name}` v{created_dataset.version}." 
- ) - project_client.datasets.delete( - name=created_dataset.name or "", version=created_dataset.version or "" - ) - except Exception as exc: # pylint: disable=broad-exception-caught - print( - f" (warning) could not delete generated dataset " - f"`{created_dataset.name}` v{created_dataset.version}: {exc}" - ) + _try_delete( + f"generated dataset `{created_dataset.name}` v{created_dataset.version}", + project_client.datasets.delete, + name=created_dataset.name or "", + version=created_dataset.version or "", + ) if submitted_job_id is not None: - try: - print(f"Delete the data generation job `{submitted_job_id}`.") - project_client.beta.datasets.delete_generation_job(job_id=submitted_job_id) - except Exception as exc: # pylint: disable=broad-exception-caught - print( - f" (warning) could not delete data generation job `{submitted_job_id}`: {exc}" - ) + _try_delete( + f"data generation job `{submitted_job_id}`", + project_client.beta.datasets.delete_generation_job, + job_id=submitted_job_id, + ) if conversation_ids: try: with project_client.get_openai_client() as openai_client: for cid in conversation_ids: - try: - openai_client.conversations.delete(conversation_id=cid) - print(f"Deleted seeded conversation `{cid}`.") - except Exception as exc: # pylint: disable=broad-exception-caught - print(f" (warning) could not delete conversation `{cid}`: {exc}") + _try_delete( + f"seeded conversation `{cid}`", + openai_client.conversations.delete, + conversation_id=cid, + ) except Exception as exc: # pylint: disable=broad-exception-caught print(f" (warning) could not open OpenAI client for conversation cleanup: {exc}") if created_agent is not None: - try: - project_client.agents.delete_version( - agent_name=created_agent.name, agent_version=created_agent.version - ) - print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.") - except Exception as exc: # pylint: disable=broad-exception-caught - print(f" (warning) could not delete temporary agent 
`{created_agent.name}`: {exc}") + _try_delete( + f"temporary agent `{created_agent.name}` v{created_agent.version}", + project_client.agents.delete_version, + agent_name=created_agent.name, + agent_version=created_agent.version, + ) From 6a255e8440051b927725ccae3784effad24e048a Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 21:51:11 -0700 Subject: [PATCH 4/8] Tighten comments in self-contained traces sample Trim verbose section banners and docstring prose; replace multi-line comments with single-line equivalents. No behavior change. 282 -> 257 lines (-9%). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 73 ++++++------------- 1 file changed, 24 insertions(+), 49 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index 8b5f06fd59b0..41e798cc3844 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -9,23 +9,16 @@ Generates an evaluation dataset from an agent's conversation traces. The sample is fully self-contained: - 1. Wires up Azure Monitor + the AIProjectInstrumentor so the temporary - agent's calls emit semantic GenAI spans (with message content) to - Application Insights. - 2. Creates a temporary Foundry agent and runs a few sample - conversations against it so spans flow to Application Insights. + 1. Wires up Azure Monitor + AIProjectInstrumentor so agent calls emit + semantic GenAI spans (with content) to Application Insights. + 2. Creates a temporary agent and seeds spans with sample conversations. 3. 
Waits for ingestion, then submits a `DataGenerationJob` - (scenario=EVALUATION, source=traces) that synthesizes question/ - answer pairs from those spans. - 4. Polls the job, fetches the resulting `DatasetVersion`, and prints - the count of generated samples. - 5. Cleans up the dataset, job, seeded conversations, and the - temporary agent. - - To run against an existing agent that already has recent traces in - Application Insights, replace the seeding block (step 2) with your - agent's name and skip the ingestion wait. The data-generation API call - (step 3) is the same. + (scenario=EVALUATION, source=traces) that synthesizes Q/A pairs. + 4. Polls the job and fetches the resulting `DatasetVersion`. + 5. Cleans up the dataset, job, seeded conversations, and agent. + + To adapt for an existing agent with recent traces, replace step 2 with + your agent's name and skip the ingestion wait. USAGE: python sample_dataset_generation_job_traces_for_evaluation.py @@ -40,15 +33,13 @@ found in the overview page of your Microsoft Foundry project. 2) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name used to drive the temporary agent during trace seeding. - 3) DATASET_NAME - Optional. Name to assign to the generated output - dataset. Defaults to `traces-eval-sample`. The service caps the - rendered output name at 50 characters, so keep custom values short - - the sample appends a unique run id suffix. - 4) POLL_INTERVAL_SECONDS - Optional. Seconds to sleep between status - polls for the data generation job. Defaults to 10. - 5) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after - seeding for Application Insights to ingest the emitted spans before - submitting the data generation job. Defaults to 180. + 3) DATASET_NAME - Optional. Output dataset name. Defaults to + `traces-eval-sample`. Service caps the rendered name at 50 chars + (the sample appends a unique run-id suffix). + 4) POLL_INTERVAL_SECONDS - Optional. Sleep between job status polls. 
+ Defaults to 10. + 5) TRACE_INGESTION_WAIT_SECONDS - Optional. Wait after seeding for + Application Insights ingestion. Defaults to 180. """ import os @@ -79,10 +70,7 @@ load_dotenv() -# Short persona used to make seeded traces look like real customer-support -# conversations. The data-gen service synthesizes eval samples from these -# traces, so the persona just needs enough domain detail to answer the -# seeding prompts confidently. +# Short persona; covers only the topics the seeded prompts ask about. AGENT_INSTRUCTIONS = """\ You are the Widgets & Gizmos customer-support agent. @@ -132,9 +120,7 @@ poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10")) trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180")) -# Unique per-run id used for the output dataset name and the temporary -# agent name so repeated runs do not collide and so any matched traces -# clearly belong to this run. Output names are capped at 50 chars. +# Per-run id keeps repeated runs from colliding; output names are capped at 50 chars. run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" output_dataset_name = f"{dataset_name}-{run_id}" if len(output_dataset_name) > 50: @@ -168,12 +154,9 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: created_dataset: Optional[DatasetVersion] = None try: - # ------------------------------------------------------------------ - # 1. Configure Azure Monitor + GenAI instrumentation so the - # temporary agent's calls emit semantic GenAI spans (with - # message content) to Application Insights. - # ------------------------------------------------------------------ - # AIProjectInstrumentor reads this env var at instrument() time. + # 1. Configure Azure Monitor + GenAI instrumentation to emit spans with content. + # AIProjectInstrumentor gates on this env var at instrument() time; without it + # instrument() returns early and no spans flow. 
os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true" print("Configure Azure Monitor exporter from the project's Application Insights connection.") @@ -181,10 +164,7 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: configure_azure_monitor(connection_string=connection_string) AIProjectInstrumentor().instrument(enable_content_recording=True) - # ------------------------------------------------------------------ - # 2. Create a temporary agent and seed traces by running a few - # conversations against it. - # ------------------------------------------------------------------ + # 2. Create a temporary agent and seed traces. print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).") created_agent = project_client.agents.create_version( agent_name=agent_name, @@ -222,11 +202,8 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: ) time.sleep(trace_ingestion_wait_seconds) - # ------------------------------------------------------------------ # 3. Submit a data generation job that reads the agent's traces. - # ------------------------------------------------------------------ - # Cover a small backoff before seeding through "now" so the seeded - # spans definitely fall inside the queried window. + # Small backoff so the seeded spans fall inside the queried window. start_time = seed_start - timedelta(minutes=5) end_time = datetime.now(tz=timezone.utc) @@ -247,7 +224,7 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: end_time=end_time, ), ], - # Service requires max_samples to be between 15 and 1000. + # Service requires max_samples in [15, 1000]. 
options=TracesDataGenerationJobOptions(max_samples=15), output_options=DataGenerationJobOutputOptions(name=output_dataset_name), ), @@ -268,9 +245,7 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: message = job.error.message if job.error is not None else "" raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}") - # ------------------------------------------------------------------ # 4. Resolve the generated dataset. - # ------------------------------------------------------------------ outputs = (job.result.outputs if job.result is not None else None) or [] dataset_output = next( (o for o in outputs if isinstance(o, DatasetDataGenerationJobOutput)), None From e321c41a1e5749d710d79574ad2216ea10475860 Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 21:54:04 -0700 Subject: [PATCH 5/8] Inline cleanup try/except in self-contained traces sample Replace the _try_delete helper with one inline try/except per resource. Each cleanup now reads top-to-bottom at the call site (preferred for sample readability) and drops a layer of indirection. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index 41e798cc3844..a10257e2662f 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -46,7 +46,7 @@ import time import uuid from datetime import datetime, timedelta, timezone -from typing import Callable, List, Optional +from typing import List, Optional from dotenv import load_dotenv @@ -134,15 +134,6 @@ TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED} -def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: object) -> None: - """Best-effort delete; logs and swallows failures so later cleanup steps still run.""" - try: - fn(*args, **kwargs) - print(f"Deleted {label}.") - except Exception as exc: # pylint: disable=broad-exception-caught - print(f" (warning) could not delete {label}: {exc}") - - with ( DefaultAzureCredential() as credential, AIProjectClient(endpoint=endpoint, credential=credential) as project_client, @@ -266,36 +257,40 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: finally: # Best-effort cleanup, outputs -> producers (dataset, job, conversations, agent). 
if created_dataset is not None: - _try_delete( - f"generated dataset `{created_dataset.name}` v{created_dataset.version}", - project_client.datasets.delete, - name=created_dataset.name or "", - version=created_dataset.version or "", - ) + try: + project_client.datasets.delete( + name=created_dataset.name or "", + version=created_dataset.version or "", + ) + print(f"Deleted dataset `{created_dataset.name}` v{created_dataset.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete dataset: {exc}") if submitted_job_id is not None: - _try_delete( - f"data generation job `{submitted_job_id}`", - project_client.beta.datasets.delete_generation_job, - job_id=submitted_job_id, - ) + try: + project_client.beta.datasets.delete_generation_job(job_id=submitted_job_id) + print(f"Deleted data generation job `{submitted_job_id}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete job: {exc}") if conversation_ids: try: with project_client.get_openai_client() as openai_client: for cid in conversation_ids: - _try_delete( - f"seeded conversation `{cid}`", - openai_client.conversations.delete, - conversation_id=cid, - ) + try: + openai_client.conversations.delete(conversation_id=cid) + print(f"Deleted seeded conversation `{cid}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete conversation `{cid}`: {exc}") except Exception as exc: # pylint: disable=broad-exception-caught print(f" (warning) could not open OpenAI client for conversation cleanup: {exc}") if created_agent is not None: - _try_delete( - f"temporary agent `{created_agent.name}` v{created_agent.version}", - project_client.agents.delete_version, - agent_name=created_agent.name, - agent_version=created_agent.version, - ) + try: + project_client.agents.delete_version( + agent_name=created_agent.name, + agent_version=created_agent.version, + ) + 
print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete agent: {exc}") From 7cad865defb518b16dcfb5e7cc0542445a8b8975 Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 22:27:24 -0700 Subject: [PATCH 6/8] Compress persona and replace conversation literal with a loop Persona collapsed to 4 inline declarations (14 -> 5 lines). The nested SEEDING_CONVERSATIONS list-of-lists is replaced by a flat SEED_PROMPTS list plus NUM_CONVERSATIONS constant; the seeding loop cycles each conversation through the same prompts. Behavior unchanged - still 3 x 5 = 15 turns and 15 generated samples (live-tested). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 58 +++++-------------- 1 file changed, 15 insertions(+), 43 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index a10257e2662f..228e5e1d9e25 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -72,46 +72,21 @@ # Short persona; covers only the topics the seeded prompts ask about. AGENT_INSTRUCTIONS = """\ -You are the Widgets & Gizmos customer-support agent. - -Returns: Unopened products may be returned within 30 days for a full refund. -Defective products may be returned within 90 days at no cost. Refunds take -5-7 business days. - -Warranty: Standard products carry a 1-year limited warranty. The Deluxe -Sprocket carries a 5-year warranty. Warranty repairs are free; we cover -return shipping. Repairs take 10-14 business days. 
- -Products: Standard Widget is $19.99 (bundle of 10 for $149.99). Deluxe -Sprocket is $79.99. - -If you do not know the answer, say so. Be concise. +Widgets & Gizmos support agent. Be concise. Say so if unsure. +Returns: unopened 30 days full refund; defective 90 days free; refunds 5-7 business days. +Warranty: Standard 1 year, Deluxe Sprocket 5 years; repairs free, we pay shipping, 10-14 days. +Products: Standard Widget $19.99 (10-pack $149.99); Deluxe Sprocket $79.99. """ -SEEDING_CONVERSATIONS: List[List[str]] = [ - [ - "Can I return a defective Standard Widget after 45 days?", - "How long does a refund take?", - "What about an unopened Standard Widget?", - "Do I pay return shipping?", - "Is there a restocking fee?", - ], - [ - "What is the warranty on the Deluxe Sprocket?", - "What does the warranty cover?", - "Do warranty repairs cost anything?", - "How long do warranty repairs take?", - "Who pays return shipping for a warranty claim?", - ], - [ - "How much is a Standard Widget?", - "Is there a bundle deal?", - "What is the Deluxe Sprocket price?", - "What products do you carry?", - "Do you sell accessories?", - ], +SEED_PROMPTS = [ + "Refund policy?", + "Warranty length?", + "Standard Widget price?", + "Any bundle deal?", + "Who pays shipping for warranty repairs?", ] +NUM_CONVERSATIONS = 3 # NUM_CONVERSATIONS * len(SEED_PROMPTS) must be >= max_samples (15). endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] @@ -164,16 +139,13 @@ print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") seed_start = datetime.now(tz=timezone.utc) - print( - f"Seed {len(SEEDING_CONVERSATIONS)} conversation(s) x " - f"{len(SEEDING_CONVERSATIONS[0])} turn(s) against the agent." 
- ) + print(f"Seed {NUM_CONVERSATIONS} conversation(s) x {len(SEED_PROMPTS)} turn(s) against the agent.") with project_client.get_openai_client() as openai_client: - for ci, arc in enumerate(SEEDING_CONVERSATIONS, start=1): + for ci in range(1, NUM_CONVERSATIONS + 1): conversation = openai_client.conversations.create() conversation_ids.append(conversation.id) - print(f" - conversation {ci}/{len(SEEDING_CONVERSATIONS)} (id: {conversation.id})") - for prompt in arc: + print(f" - conversation {ci}/{NUM_CONVERSATIONS} (id: {conversation.id})") + for prompt in SEED_PROMPTS: openai_client.responses.create( conversation=conversation.id, input=prompt, From d64140571c68992fa9e5e4705de6a92089d3dd3f Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sun, 31 May 2026 09:56:05 -0700 Subject: [PATCH 7/8] Minimize self-contained traces sample to 1 seeded turn max_samples is a cap on generated samples, not a floor on input traces. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 75 ++++++++----------- 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index 228e5e1d9e25..b61f37577f8c 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -46,7 +46,7 @@ import time import uuid from datetime import datetime, timedelta, timezone -from typing import List, Optional +from typing import Optional from dotenv import load_dotenv @@ -70,23 +70,13 @@ load_dotenv() -# Short persona; covers only the topics the seeded prompts ask about. -AGENT_INSTRUCTIONS = """\ -Widgets & Gizmos support agent. Be concise. Say so if unsure. 
-Returns: unopened 30 days full refund; defective 90 days free; refunds 5-7 business days. -Warranty: Standard 1 year, Deluxe Sprocket 5 years; repairs free, we pay shipping, 10-14 days. -Products: Standard Widget $19.99 (10-pack $149.99); Deluxe Sprocket $79.99. -""" - - -SEED_PROMPTS = [ - "Refund policy?", - "Warranty length?", - "Standard Widget price?", - "Any bundle deal?", - "Who pays shipping for warranty repairs?", -] -NUM_CONVERSATIONS = 3 # NUM_CONVERSATIONS * len(SEED_PROMPTS) must be >= max_samples (15). +# Minimal persona + prompt; one seeded turn is enough for the job to succeed +# (max_samples is the cap on generated samples, not a floor on input traces). +AGENT_INSTRUCTIONS = ( + "Widgets & Gizmos support agent. Be concise. " + "Refunds: unopened 30 days; defective 90 days; 5-7 business days to process." +) +SEED_PROMPT = "What is your refund policy?" endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] @@ -115,7 +105,7 @@ ): created_agent = None - conversation_ids: List[str] = [] + created_conversation_id: Optional[str] = None submitted_job_id: Optional[str] = None created_dataset: Optional[DatasetVersion] = None @@ -139,24 +129,22 @@ print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") seed_start = datetime.now(tz=timezone.utc) - print(f"Seed {NUM_CONVERSATIONS} conversation(s) x {len(SEED_PROMPTS)} turn(s) against the agent.") + print(f"Seed one conversation against the agent (prompt: {SEED_PROMPT!r}).") with project_client.get_openai_client() as openai_client: - for ci in range(1, NUM_CONVERSATIONS + 1): - conversation = openai_client.conversations.create() - conversation_ids.append(conversation.id) - print(f" - conversation {ci}/{NUM_CONVERSATIONS} (id: {conversation.id})") - for prompt in SEED_PROMPTS: - openai_client.responses.create( - conversation=conversation.id, - input=prompt, - extra_body={ - "agent_reference": { - "name": created_agent.name, - "id": created_agent.id, - "type": "agent_reference", - } - }, - 
) + conversation = openai_client.conversations.create() + created_conversation_id = conversation.id + print(f" - conversation id: {conversation.id}") + openai_client.responses.create( + conversation=conversation.id, + input=SEED_PROMPT, + extra_body={ + "agent_reference": { + "name": created_agent.name, + "id": created_agent.id, + "type": "agent_reference", + } + }, + ) print( f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the emitted spans. " @@ -187,7 +175,8 @@ end_time=end_time, ), ], - # Service requires max_samples in [15, 1000]. + # Service requires max_samples in [15, 1000]. It's a cap on + # generated samples - one seeded trace turn is enough. options=TracesDataGenerationJobOptions(max_samples=15), output_options=DataGenerationJobOutputOptions(name=output_dataset_name), ), @@ -245,17 +234,13 @@ except Exception as exc: # pylint: disable=broad-exception-caught print(f" (warning) could not delete job: {exc}") - if conversation_ids: + if created_conversation_id is not None: try: with project_client.get_openai_client() as openai_client: - for cid in conversation_ids: - try: - openai_client.conversations.delete(conversation_id=cid) - print(f"Deleted seeded conversation `{cid}`.") - except Exception as exc: # pylint: disable=broad-exception-caught - print(f" (warning) could not delete conversation `{cid}`: {exc}") + openai_client.conversations.delete(conversation_id=created_conversation_id) + print(f"Deleted seeded conversation `{created_conversation_id}`.") except Exception as exc: # pylint: disable=broad-exception-caught - print(f" (warning) could not open OpenAI client for conversation cleanup: {exc}") + print(f" (warning) could not delete conversation: {exc}") if created_agent is not None: try: From 416fdd8cb3a881f1544b15358ba856202928ddb7 Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sun, 31 May 2026 10:40:29 -0700 Subject: [PATCH 8/8] Drop client instrumentation, 'temporary' wording, optional env vars Prompt agents emit 
server-side traces to the project's connected App Insights, so client-side AIProjectInstrumentor + configure_azure_monitor are not required. Hardcode poll/wait constants and dataset name (still made unique via the run id). Verified live: PASS in 231s, 1 sample generated, clean teardown. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...et_generation_job_traces_for_evaluation.py | 77 ++++++------------- 1 file changed, 24 insertions(+), 53 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py index b61f37577f8c..498a2e9bdca2 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py @@ -9,15 +9,16 @@ Generates an evaluation dataset from an agent's conversation traces. The sample is fully self-contained: - 1. Wires up Azure Monitor + AIProjectInstrumentor so agent calls emit - semantic GenAI spans (with content) to Application Insights. - 2. Creates a temporary agent and seeds spans with sample conversations. - 3. Waits for ingestion, then submits a `DataGenerationJob` + 1. Creates an agent and seeds spans with a sample conversation. + 2. Waits for ingestion, then submits a `DataGenerationJob` (scenario=EVALUATION, source=traces) that synthesizes Q/A pairs. - 4. Polls the job and fetches the resulting `DatasetVersion`. - 5. Cleans up the dataset, job, seeded conversations, and agent. + 3. Polls the job and fetches the resulting `DatasetVersion`. + 4. Cleans up the dataset, job, seeded conversation, and agent. - To adapt for an existing agent with recent traces, replace step 2 with + Prerequisite: the project must have an Application Insights resource + connected so the agent emits server-side traces. 
+ + To adapt for an existing agent with recent traces, replace step 1 with your agent's name and skip the ingestion wait. USAGE: @@ -25,21 +26,13 @@ Before running the sample: - pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv \\ - azure-monitor-opentelemetry azure-core-tracing-opentelemetry + pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv Set these environment variables with your own values: 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your Microsoft Foundry project. 2) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name used - to drive the temporary agent during trace seeding. - 3) DATASET_NAME - Optional. Output dataset name. Defaults to - `traces-eval-sample`. Service caps the rendered name at 50 chars - (the sample appends a unique run-id suffix). - 4) POLL_INTERVAL_SECONDS - Optional. Sleep between job status polls. - Defaults to 10. - 5) TRACE_INGESTION_WAIT_SECONDS - Optional. Wait after seeding for - Application Insights ingestion. Defaults to 180. + to drive the agent during trace seeding. """ import os @@ -64,8 +57,6 @@ TracesDataGenerationJobOptions, TracesDataGenerationJobSource, ) -from azure.ai.projects.telemetry import AIProjectInstrumentor -from azure.monitor.opentelemetry import configure_azure_monitor load_dotenv() @@ -81,20 +72,14 @@ endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] model_deployment = os.environ["FOUNDRY_MODEL_NAME"] -dataset_name = os.environ.get("DATASET_NAME", "traces-eval-sample") -poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10")) -trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180")) +DATASET_NAME = "traces-eval-sample" +POLL_INTERVAL_SECONDS = 10 +TRACE_INGESTION_WAIT_SECONDS = 180 # Per-run id keeps repeated runs from colliding; output names are capped at 50 chars. 
run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" -output_dataset_name = f"{dataset_name}-{run_id}" -if len(output_dataset_name) > 50: - raise ValueError( - f"Output dataset name `{output_dataset_name}` exceeds the 50-character service limit. " - f"Shorten DATASET_NAME (currently `{dataset_name}`) so that `-` fits within 50 characters." - ) - -agent_name = f"traces-eval-sample-{run_id}" +output_dataset_name = f"{DATASET_NAME}-{run_id}" +agent_name = f"{DATASET_NAME}-{run_id}" TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED} @@ -110,18 +95,8 @@ created_dataset: Optional[DatasetVersion] = None try: - # 1. Configure Azure Monitor + GenAI instrumentation to emit spans with content. - # AIProjectInstrumentor gates on this env var at instrument() time; without it - # instrument() returns early and no spans flow. - os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true" - - print("Configure Azure Monitor exporter from the project's Application Insights connection.") - connection_string = project_client.telemetry.get_application_insights_connection_string() - configure_azure_monitor(connection_string=connection_string) - AIProjectInstrumentor().instrument(enable_content_recording=True) - - # 2. Create a temporary agent and seed traces. - print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).") + # 1. Create an agent and seed traces. + print(f"Create agent `{agent_name}` (model: `{model_deployment}`).") created_agent = project_client.agents.create_version( agent_name=agent_name, definition=PromptAgentDefinition(model=model_deployment, instructions=AGENT_INSTRUCTIONS), @@ -146,14 +121,10 @@ }, ) - print( - f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the emitted spans. 
" - f"Override with TRACE_INGESTION_WAIT_SECONDS.", - flush=True, - ) - time.sleep(trace_ingestion_wait_seconds) + print(f"Wait {TRACE_INGESTION_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True) + time.sleep(TRACE_INGESTION_WAIT_SECONDS) - # 3. Submit a data generation job that reads the agent's traces. + # 2. Submit a data generation job that reads the agent's traces. # Small backoff so the seeded spans fall inside the queried window. start_time = seed_start - timedelta(minutes=5) end_time = datetime.now(tz=timezone.utc) @@ -169,7 +140,7 @@ scenario=DataGenerationJobScenario.EVALUATION, sources=[ TracesDataGenerationJobSource( - description="Application Insights conversation traces for the temporary agent.", + description="Application Insights conversation traces for the agent.", agent_name=agent_name, start_time=start_time, end_time=end_time, @@ -187,7 +158,7 @@ print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True) while job.status not in TERMINAL_STATUSES: - time.sleep(poll_interval_seconds) + time.sleep(POLL_INTERVAL_SECONDS) print(".", end="", flush=True) job = project_client.beta.datasets.get_generation_job(job_id=job.id) print() @@ -197,7 +168,7 @@ message = job.error.message if job.error is not None else "" raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}") - # 4. Resolve the generated dataset. + # 3. Resolve the generated dataset. outputs = (job.result.outputs if job.result is not None else None) or [] dataset_output = next( (o for o in outputs if isinstance(o, DatasetDataGenerationJobOutput)), None @@ -248,6 +219,6 @@ agent_name=created_agent.name, agent_version=created_agent.version, ) - print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.") + print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.") except Exception as exc: # pylint: disable=broad-exception-caught print(f" (warning) could not delete agent: {exc}")