From 68980bc5c4f5374ce5e9342fdcd5320071eb462f Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sat, 30 May 2026 20:55:29 -0700
Subject: [PATCH 1/8] [ai/azure-ai-projects] make traces-for-evaluation sample
 self-contained

Rewrite samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py so it works without prior setup. By default, the sample now creates a temporary Foundry agent, runs three multi-turn Widgets & Gizmos conversations against it with GenAI content tracing enabled (configure_azure_monitor + AIProjectInstrumentor with enable_content_recording=True), waits for App Insights ingestion, then submits the existing data-generation job over a window that brackets the seeded traces (from 5 minutes before seeding until job submission). The generated dataset and data-generation job are deleted at the end of a successful run; the temporary agent and seeded conversations are cleaned up in a best-effort finally block.

Users who already have an agent with traces can opt into bring-your-own-agent mode by setting FOUNDRY_AGENT_NAME; in that mode the sample skips agent creation, trace seeding, and ingestion wait and uses the existing FOUNDRY_TRACES_WINDOW_DAYS look-back window (default 7 days). New seeding knobs (TRACE_SEEDING_CONVERSATIONS, TRACE_SEEDING_TURNS, TRACE_INGESTION_WAIT_SECONDS) make timing tunable per environment.

Validated end-to-end against the build26-bug-bash project on gpt-5.1: self-contained run produced 15 generated samples and cleaned up all temporary resources successfully.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 497 ++++++++++++++----
 1 file changed, 407 insertions(+), 90 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index 9b2ca86a89bf..e7c60468b4c7 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -7,47 +7,79 @@
 """
 DESCRIPTION:
     Generates an evaluation dataset from an agent's recent conversation
-    traces. The sample:
+    traces. The sample runs in one of two modes:
 
-      1. Creates a `DataGenerationJob` (scenario=EVALUATION, type=traces) that
-         reads spans from Application Insights for an existing agent within a
+      * Self-contained mode (default): Creates a temporary Foundry agent,
+        runs a few sample conversations against it with GenAI content
+        tracing enabled so spans flow to Application Insights, waits for
+        ingestion, then runs the data generation job. The temporary agent
+        and conversations are deleted at the end. Use this mode to try the
+        sample without preparing anything in advance.
+      * Bring-your-own-agent mode (BYO): Set FOUNDRY_AGENT_NAME to point at
+        an existing agent that already has recent conversation traces. The
+        sample skips agent creation and trace seeding and uses your agent
+        as-is.
+
+    In both modes, the sample:
+      1. Creates a `DataGenerationJob` (scenario=EVALUATION, type=traces)
+         that reads spans from Application Insights for the agent within a
          time window and synthesizes question / answer pairs into a new
          versioned Dataset.
-      2. Polls the job to completion and resolves the resulting `DatasetVersion`.
-      3. Cleans up the generated dataset and the data generation job.
+      2. Polls the job to completion and resolves the resulting
+         `DatasetVersion`.
+      3. Cleans up the generated dataset, the data generation job, and
+         (in self-contained mode) the temporary agent and conversations.
 
-    The Traces source consumes existing telemetry, so no `model_options` are
-    required — the service derives samples directly from the agent's traces.
-    The agent must have at least one trace recorded within the configured
-    look-back window or the job will succeed with zero generated samples.
+    The Traces source consumes existing telemetry, so no `model_options`
+    are required — the service derives samples directly from the agent's
+    traces.
 
 USAGE:
     python sample_dataset_generation_job_traces_for_evaluation.py
 
     Before running the sample:
 
-    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
+    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
+    pip install azure-monitor-opentelemetry azure-core-tracing-opentelemetry
+
+    (The two telemetry packages are only required for self-contained mode.)
 
     Set these environment variables with your own values:
-    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found
-       in the overview page of your Microsoft Foundry project.
-    2) FOUNDRY_AGENT_NAME - Required. The name of an agent (Foundry Agent or
-       OpenTelemetry-instrumented third-party agent) that has recent
-       conversation traces in Application Insights.
-    3) DATASET_NAME - Optional. Name to assign to the generated output dataset.
-       Defaults to `traces-eval-sample`. The service caps the rendered output
-       name at 50 characters, so keep custom values short — the sample appends
-       a unique run id suffix.
-    4) FOUNDRY_TRACES_WINDOW_DAYS - Optional. How far back, in days, to look for
-       agent traces. Defaults to 7.
-    5) POLL_INTERVAL_SECONDS - Optional. Number of seconds to sleep between status
-       polls for the data generation job. Defaults to 10.
+    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
+       found in the overview page of your Microsoft Foundry project.
+    2) FOUNDRY_AGENT_NAME - Optional. The name of an existing agent (Foundry
+       Agent or OpenTelemetry-instrumented third-party agent) that already
+       has recent conversation traces in Application Insights. If set, the
+       sample skips agent creation and trace seeding and uses this agent.
+    3) FOUNDRY_MODEL_NAME - Required for self-contained mode. The Azure OpenAI
+       deployment name used to drive the temporary agent during trace
+       seeding. Ignored when FOUNDRY_AGENT_NAME is set.
+    4) DATASET_NAME - Optional. Name to assign to the generated output
+       dataset. Defaults to `traces-eval-sample`. The service caps the
+       rendered output name at 50 characters, so keep custom values short —
+       the sample appends a unique run id suffix.
+    5) FOUNDRY_TRACES_WINDOW_DAYS - Optional. How far back, in days, to look
+       for agent traces when in BYO mode. Defaults to 7. Ignored in
+       self-contained mode (the sample computes a window that covers the
+       seeded traces, from 5 minutes before seeding until job submission).
+    6) POLL_INTERVAL_SECONDS - Optional. Number of seconds to sleep between
+       status polls for the data generation job. Defaults to 10.
+    7) TRACE_SEEDING_CONVERSATIONS - Optional. Number of conversations to
+       seed in self-contained mode. Defaults to 3.
+    8) TRACE_SEEDING_TURNS - Optional. Turns per seeded conversation in
+       self-contained mode. Defaults to 5.
+    9) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after seeding
+       for Application Insights to ingest the emitted spans before
+       submitting the data generation job. Defaults to 180.
 """
 
+import importlib
 import os
+import sys
 import time
 import uuid
 from datetime import datetime, timedelta, timezone
+from typing import List, Optional
 
 from dotenv import load_dotenv
 
@@ -61,18 +93,113 @@
     DatasetDataGenerationJobOutput,
     DatasetVersion,
     JobStatus,
+    PromptAgentDefinition,
     TracesDataGenerationJobOptions,
     TracesDataGenerationJobSource,
 )
 
 load_dotenv()
 
+# Persona used when seeding traces in self-contained mode. Mirrors the
+# Widgets & Gizmos persona from
+# sample_dataset_generation_job_simpleqna_with_agent_source.py so the
+# generated traces have substantive multi-turn content the data generation
+# service can synthesize useful eval samples from.
+AGENT_INSTRUCTIONS = """\
+You are the Widgets & Gizmos customer-support agent. Help customers with
+returns, warranty claims, repairs, product specifications, compatibility,
+and ordering for Widgets, Gizmos, Sprockets, and accessories.
+
+Use this knowledge base when answering. Cite the relevant policy or spec
+directly when you can.
+
+Returns
+  * Unopened products may be returned within 30 days for a full refund.
+  * Opened products may be returned within 14 days for a refund minus a
+    10% restocking fee. Defective products may be returned within 90 days
+    at no cost.
+  * Refunds are processed within 5-7 business days after the return is
+    received and inspected.
+  * Items lost in shipping should be reported within 21 days of the order
+    date; we re-ship at no cost.
+
+Warranty
+  * Standard products carry a 1-year limited warranty against
+    manufacturing defects.
+  * The Deluxe Sprocket carries a 5-year limited warranty.
+  * Warranty repairs are free. Customer ships the unit to us prepaid; we
+    cover return shipping. Typical turnaround is 10-14 business days.
+
+Specifications
+  * Standard Widget: 4 inches, blue or red, weighs 6oz, made of aluminum.
+  * Compact Widget: 2 inches, gray only, weighs 3oz, made of aluminum.
+  * Gizmo: 6 inches, available in green, weighs 10oz, made of stainless
+    steel and ABS plastic. Compatible with all Sprocket Adapter v2 mounts.
+  * Sprocket Adapter v2: universal mount that fits Widgets, Gizmos, and
+    third-party 1/4-20 hardware.
+
+Pricing & bundles
+  * Standard Widget: $19.99 each, bundle of 10 for $149.99.
+  * Gizmo: $34.99 each, bundle of 5 for $129.99.
+  * Deluxe Sprocket: $79.99 each.
+
+If you do not know the answer, say so and offer to escalate. Be concise.
+"""
+
+# Multi-turn conversation arcs used to seed traces. Each inner list is one
+# conversation; the sample runs each turn against the temporary agent.
+SEEDING_CONVERSATION_ARCS = [
+    [
+        "Hi, I need to return a defective Standard Widget.",
+        "I bought it 45 days ago. Is it still eligible for a refund?",
+        "What about a Gizmo I ordered but never received - it has been 3 weeks?",
+        "Can I get a refund instead of a replacement shipment?",
+        "How long will the refund take to show up on my card?",
+    ],
+    [
+        "Does the Deluxe Sprocket come with a warranty?",
+        "What exactly does the warranty cover?",
+        "My Deluxe Sprocket stopped turning after 6 months - what should I do?",
+        "Do I have to pay for return shipping on a warranty claim?",
+        "How long do warranty repairs usually take?",
+    ],
+    [
+        "What is the difference between a Standard Widget and a Compact Widget?",
+        "Is the Compact Widget compatible with the Sprocket Adapter v2?",
+        "What colors and sizes are Gizmos available in?",
+        "How much is a bundle of 10 Standard Widgets?",
+        "Do you carry any third-party accessories that fit the Sprocket Adapter v2?",
+    ],
+]
+
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
-agent_name = os.environ["FOUNDRY_AGENT_NAME"]
+provided_agent_name = os.environ.get("FOUNDRY_AGENT_NAME", "").strip()
 dataset_name = os.environ.get("DATASET_NAME", "traces-eval-sample")
-traces_window_days = int(os.environ.get("FOUNDRY_TRACES_WINDOW_DAYS", "7"))
 poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10"))
 
+# Self-contained mode is enabled unless the user pointed at an existing agent.
+seed_traces = not provided_agent_name
+
+# Look-back window for BYO mode only. In self-contained mode the window is
+# computed around the seeded traces, so this knob is ignored.
+traces_window_days = int(os.environ.get("FOUNDRY_TRACES_WINDOW_DAYS", "7"))
+
+# Seeding knobs (only used when seed_traces is True).
+trace_seeding_conversations = int(
+    os.environ.get("TRACE_SEEDING_CONVERSATIONS", str(len(SEEDING_CONVERSATION_ARCS)))
+)
+trace_seeding_turns = int(
+    os.environ.get("TRACE_SEEDING_TURNS", str(len(SEEDING_CONVERSATION_ARCS[0])))
+)
+trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180"))
+
+if seed_traces and "FOUNDRY_MODEL_NAME" not in os.environ:
+    raise EnvironmentError(
+        "Self-contained mode requires FOUNDRY_MODEL_NAME (the Azure OpenAI deployment "
+        "name used to drive the temporary agent). Either set FOUNDRY_MODEL_NAME or set "
+        "FOUNDRY_AGENT_NAME to use an existing agent with traces."
+    )
+
 # Unique per-run output dataset name so repeated runs do not collide.
 # Output names are capped at 50 characters by the service.
 run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
@@ -83,78 +210,268 @@
         f"Lower DATASET_NAME (currently `{dataset_name}`) so that `<DATASET_NAME>-<run id>` fits within 50 characters."
     )
 
-# Trace look-back window: now - `traces_window_days` ... now.
-end_time = datetime.now(tz=timezone.utc)
-start_time = end_time - timedelta(days=traces_window_days)
+# Agent name used to read traces. In self-contained mode we use a unique
+# per-run name so concurrent runs do not collide and so we know any matched
+# traces belong to this run.
+agent_name = provided_agent_name or f"traces-eval-sample-{run_id}"
 
 TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED}
 
+
+def _safe_console(text: str) -> str:
+    """Encode `text` so it always prints on the active stdout encoding.
+
+    Some Windows consoles default to cp1252, which cannot encode characters
+    the model may emit (e.g. smart quotes, non-breaking hyphens). We replace
+    any unencodable code points with `?` so a preview line never crashes the
+    sample.
+    """
+    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
+    return text.encode(encoding, errors="replace").decode(encoding, errors="replace")
+
+
+def _seed_agent_traces(
+    project_client: AIProjectClient,
+    agent_name_to_use: str,
+    agent_id_to_use: str,
+    conversation_count: int,
+    turns_per_conversation: int,
+    conversation_ids: List[str],
+) -> None:
+    """Run a few conversations against the agent so GenAI spans flow to App Insights.
+
+    Created conversation IDs are appended to `conversation_ids` as each
+    conversation is created, so the caller can clean them up even if seeding
+    raises mid-way through.
+    """
+    arcs = SEEDING_CONVERSATION_ARCS
+    with project_client.get_openai_client() as openai_client:
+        for ci in range(conversation_count):
+            arc = arcs[ci % len(arcs)]
+            conversation = openai_client.conversations.create()
+            conversation_ids.append(conversation.id)
+            print(f"  - conversation {ci + 1}/{conversation_count} (id: {conversation.id})")
+            for ti in range(turns_per_conversation):
+                prompt = arc[ti % len(arc)]
+                response = openai_client.responses.create(
+                    conversation=conversation.id,
+                    input=prompt,
+                    extra_body={
+                        "agent_reference": {
+                            "name": agent_name_to_use,
+                            "id": agent_id_to_use,
+                            "type": "agent_reference",
+                        }
+                    },
+                )
+                preview = (response.output_text or "").replace("\n", " ")
+                if len(preview) > 80:
+                    preview = preview[:77] + "..."
+                print(_safe_console(f"      turn {ti + 1}: {prompt}"))
+                print(_safe_console(f"        response: {preview}"))
+
+
+mode_label = (
+    "self-contained (will create a temporary agent and seed traces)"
+    if seed_traces
+    else f"bring-your-own-agent (`{provided_agent_name}`)"
+)
+print(f"Mode: {mode_label}.")
+
 with (
     DefaultAzureCredential() as credential,
     AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
 ):
 
-    # ------------------------------------------------------------------
-    # 1. Submit a data generation job that reads agent traces.
-    # ------------------------------------------------------------------
-    print(f"Create a data generation job from traces for agent `{agent_name}` (window: {traces_window_days} day(s)).")
-    job = DataGenerationJob(
-        inputs=DataGenerationJobInputs(
-            name=f"traces-eval-{run_id}",
-            scenario=DataGenerationJobScenario.EVALUATION,
-            sources=[
-                TracesDataGenerationJobSource(
-                    description="Application Insights conversation traces for the Foundry agent.",
-                    agent_name=agent_name,
-                    start_time=start_time,
-                    end_time=end_time,
+    created_agent = None
+    conversation_ids: List[str] = []
+    seed_start: Optional[datetime] = None
+
+    try:
+        if seed_traces:
+            # --------------------------------------------------------------
+            # 0a. Wire up Azure Monitor + GenAI instrumentation so calls to
+            #     responses.create emit semantic GenAI spans (with message
+            #     content) to Application Insights.
+            # --------------------------------------------------------------
+            try:
+                configure_azure_monitor = importlib.import_module(
+                    "azure.monitor.opentelemetry"
+                ).configure_azure_monitor
+                AIProjectInstrumentor = importlib.import_module(
+                    "azure.ai.projects.telemetry"
+                ).AIProjectInstrumentor
+            except ImportError as exc:
+                raise ImportError(
+                    "Self-contained mode requires the `azure-monitor-opentelemetry` and "
+                    "`azure-core-tracing-opentelemetry` packages. Install them with "
+                    "`pip install azure-monitor-opentelemetry azure-core-tracing-opentelemetry` "
+                    "or set FOUNDRY_AGENT_NAME to use an existing agent with traces."
+                ) from exc
+
+            # AIProjectInstrumentor requires this env var be set BEFORE
+            # instrument() is called. We force it on (not setdefault) so the
+            # temporary agent's calls always produce GenAI spans the data-gen
+            # service can read.
+            os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true"
+
+            print("Fetch Application Insights connection string and configure Azure Monitor exporter.")
+            connection_string = project_client.telemetry.get_application_insights_connection_string()
+            configure_azure_monitor(connection_string=connection_string)
+            AIProjectInstrumentor().instrument(enable_content_recording=True)
+
+            # --------------------------------------------------------------
+            # 0b. Create the temporary agent.
+            # --------------------------------------------------------------
+            model_deployment = os.environ["FOUNDRY_MODEL_NAME"]
+            print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).")
+            created_agent = project_client.agents.create_version(
+                agent_name=agent_name,
+                definition=PromptAgentDefinition(
+                    model=model_deployment,
+                    instructions=AGENT_INSTRUCTIONS,
+                ),
+            )
+            print(
+                f"Agent created (id: {created_agent.id}, name: {created_agent.name}, "
+                f"version: {created_agent.version})."
+            )
+
+            # --------------------------------------------------------------
+            # 0c. Seed traces by running a few conversations against the agent.
+            # --------------------------------------------------------------
+            seed_start = datetime.now(tz=timezone.utc)
+            print(
+                f"Seed {trace_seeding_conversations} conversation(s) x "
+                f"{trace_seeding_turns} turn(s) against the agent so spans flow to Application Insights."
+            )
+            _seed_agent_traces(
+                project_client=project_client,
+                agent_name_to_use=created_agent.name,
+                agent_id_to_use=created_agent.id,
+                conversation_count=trace_seeding_conversations,
+                turns_per_conversation=trace_seeding_turns,
+                conversation_ids=conversation_ids,
+            )
+
+            # Flush any buffered spans so the only delay we wait for below is
+            # ingestion delay, not exporter batching delay.
+            try:
+                from opentelemetry import trace as _otel_trace  # pylint: disable=import-outside-toplevel
+
+                tracer_provider = _otel_trace.get_tracer_provider()
+                force_flush = getattr(tracer_provider, "force_flush", None)
+                if callable(force_flush):
+                    force_flush()
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not force-flush tracer provider: {exc}")
+
+            print(
+                f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the "
+                f"emitted spans. Override with TRACE_INGESTION_WAIT_SECONDS.",
+                flush=True,
+            )
+            time.sleep(trace_ingestion_wait_seconds)
+
+        # ------------------------------------------------------------------
+        # 1. Submit a data generation job that reads agent traces.
+        # ------------------------------------------------------------------
+        if seed_traces and seed_start is not None:
+            # Window starts 5 minutes before seeding began and ends "now", so
+            # the seeded spans fall inside the queried window.
+            start_time = seed_start - timedelta(minutes=5)
+            end_time = datetime.now(tz=timezone.utc)
+        else:
+            # BYO mode: use the user-configurable look-back window.
+            end_time = datetime.now(tz=timezone.utc)
+            start_time = end_time - timedelta(days=traces_window_days)
+
+        print(
+            f"Create a data generation job from traces for agent `{agent_name}` "
+            f"(window: {start_time.isoformat()} .. {end_time.isoformat()})."
+        )
+        job = DataGenerationJob(
+            inputs=DataGenerationJobInputs(
+                name=f"traces-eval-{run_id}",
+                scenario=DataGenerationJobScenario.EVALUATION,
+                sources=[
+                    TracesDataGenerationJobSource(
+                        description="Application Insights conversation traces for the Foundry agent.",
+                        agent_name=agent_name,
+                        start_time=start_time,
+                        end_time=end_time,
+                    ),
+                ],
+                options=TracesDataGenerationJobOptions(
+                    # Service requires max_samples to be between 15 and 1000.
+                    max_samples=15,
                 ),
-            ],
-            options=TracesDataGenerationJobOptions(
-                # Service requires max_samples to be between 15 and 1000.
-                max_samples=15,
+                output_options=DataGenerationJobOutputOptions(name=output_dataset_name),
             ),
-            output_options=DataGenerationJobOutputOptions(name=output_dataset_name),
-        ),
-    )
-    job = project_client.beta.datasets.create_generation_job(job=job)
-    print(f"Created data generation job `{job.id}` (status: `{job.status}`).")
-
-    print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True)
-    while True:
-        job = project_client.beta.datasets.get_generation_job(job_id=job.id)
-        if job.status in TERMINAL_STATUSES:
-            break
-        time.sleep(poll_interval_seconds)
-        print(".", end="", flush=True)
-    print()
-    print(f"Final job status: `{job.status}`.")
-
-    if job.status != JobStatus.SUCCEEDED:
-        message = job.error.message if job.error is not None else "<no error message>"
-        raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}")
-
-    # Locate the Dataset output produced by the job.
-    output_name: str = ""
-    output_version: str = ""
-    for output in (job.result.outputs if job.result is not None else None) or []:
-        if isinstance(output, DatasetDataGenerationJobOutput):
-            output_name = output.name or ""
-            output_version = output.version or ""
-            break
-    if not output_name or not output_version:
-        raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.")
-
-    dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version)
-    print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`")
-    if job.result is not None and job.result.generated_samples is not None:
-        print(f"Generated samples: {job.result.generated_samples}")
-
-    # ------------------------------------------------------------------
-    # 2. Clean up.
-    # ------------------------------------------------------------------
-    print(f"Delete the generated dataset `{dataset.name}` v{dataset.version}.")
-    project_client.datasets.delete(name=dataset.name or "", version=dataset.version or "")
-
-    print(f"Delete the data generation job `{job.id}`.")
-    project_client.beta.datasets.delete_generation_job(job_id=job.id)
+        )
+        job = project_client.beta.datasets.create_generation_job(job=job)
+        print(f"Created data generation job `{job.id}` (status: `{job.status}`).")
+
+        print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True)
+        while True:
+            job = project_client.beta.datasets.get_generation_job(job_id=job.id)
+            if job.status in TERMINAL_STATUSES:
+                break
+            time.sleep(poll_interval_seconds)
+            print(".", end="", flush=True)
+        print()
+        print(f"Final job status: `{job.status}`.")
+
+        if job.status != JobStatus.SUCCEEDED:
+            message = job.error.message if job.error is not None else "<no error message>"
+            raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}")
+
+        # Locate the Dataset output produced by the job.
+        output_name: str = ""
+        output_version: str = ""
+        for output in (job.result.outputs if job.result is not None else None) or []:
+            if isinstance(output, DatasetDataGenerationJobOutput):
+                output_name = output.name or ""
+                output_version = output.version or ""
+                break
+        if not output_name or not output_version:
+            raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.")
+
+        dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version)
+        print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`")
+        if job.result is not None and job.result.generated_samples is not None:
+            print(f"Generated samples: {job.result.generated_samples}")
+
+        # ------------------------------------------------------------------
+        # 2. Clean up dataset + job.
+        # ------------------------------------------------------------------
+        print(f"Delete the generated dataset `{dataset.name}` v{dataset.version}.")
+        project_client.datasets.delete(name=dataset.name or "", version=dataset.version or "")
+
+        print(f"Delete the data generation job `{job.id}`.")
+        project_client.beta.datasets.delete_generation_job(job_id=job.id)
+
+    finally:
+        # Best-effort cleanup of the temporary agent and seeded conversations.
+        # Wrap each step so a failure in one does not skip the others, and so
+        # cleanup never masks the real exception that brought us here.
+        if conversation_ids:
+            try:
+                with project_client.get_openai_client() as openai_client:
+                    for cid in conversation_ids:
+                        try:
+                            openai_client.conversations.delete(conversation_id=cid)
+                            print(f"Deleted seeded conversation `{cid}`.")
+                        except Exception as exc:  # pylint: disable=broad-exception-caught
+                            print(f"  (warning) could not delete conversation `{cid}`: {exc}")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not open OpenAI client for conversation cleanup: {exc}")
+
+        if created_agent is not None:
+            try:
+                project_client.agents.delete_version(
+                    agent_name=created_agent.name, agent_version=created_agent.version
+                )
+                print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete temporary agent `{created_agent.name}`: {exc}")

From 6819189524143da6c9265437bfb7031e8f595003 Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sat, 30 May 2026 21:09:51 -0700
Subject: [PATCH 2/8] [ai/azure-ai-projects] move dataset+job cleanup into
 finally (review)

Initialize submitted_job_id and created_dataset before the try block, set them as soon as the job is submitted and the dataset is resolved, and move the dataset and job deletes into finally, alongside the existing conversation and agent cleanup. Previously these two deletes lived inside try, so a polling failure, dataset-get failure, or any exception between job creation and the success-path deletes would leak the data-generation job (and possibly the dataset) on the unhappy path. Each step is now wrapped in its own best-effort try/except so a failure in one does not skip the others, and so cleanup never masks the real exception. Live-tested happy path against build26-bug-bash on gpt-5.1: 15 generated samples, all 5 resources cleaned up via finally, exit 0.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 41 ++++++++++++++-----
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index e7c60468b4c7..1374f29dd62d 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -286,6 +286,8 @@ def _seed_agent_traces(
     created_agent = None
     conversation_ids: List[str] = []
     seed_start: Optional[datetime] = None
+    submitted_job_id: Optional[str] = None
+    created_dataset: Optional[DatasetVersion] = None
 
     try:
         if seed_traces:
@@ -410,6 +412,7 @@ def _seed_agent_traces(
             ),
         )
         job = project_client.beta.datasets.create_generation_job(job=job)
+        submitted_job_id = job.id
         print(f"Created data generation job `{job.id}` (status: `{job.status}`).")
 
         print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True)
@@ -438,23 +441,39 @@ def _seed_agent_traces(
             raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.")
 
         dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version)
+        created_dataset = dataset
         print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`")
         if job.result is not None and job.result.generated_samples is not None:
             print(f"Generated samples: {job.result.generated_samples}")
 
-        # ------------------------------------------------------------------
-        # 2. Clean up dataset + job.
-        # ------------------------------------------------------------------
-        print(f"Delete the generated dataset `{dataset.name}` v{dataset.version}.")
-        project_client.datasets.delete(name=dataset.name or "", version=dataset.version or "")
+    finally:
+        # Best-effort cleanup. Each step is wrapped in its own try/except so a
+        # failure in one does not skip the others, and so cleanup never masks
+        # the real exception that brought us here. Order is outputs -> producers:
+        # dataset -> job -> seeded conversations -> temporary agent.
+        if created_dataset is not None:
+            try:
+                print(
+                    f"Delete the generated dataset `{created_dataset.name}` v{created_dataset.version}."
+                )
+                project_client.datasets.delete(
+                    name=created_dataset.name or "", version=created_dataset.version or ""
+                )
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(
+                    f"  (warning) could not delete generated dataset "
+                    f"`{created_dataset.name}` v{created_dataset.version}: {exc}"
+                )
 
-        print(f"Delete the data generation job `{job.id}`.")
-        project_client.beta.datasets.delete_generation_job(job_id=job.id)
+        if submitted_job_id is not None:
+            try:
+                print(f"Delete the data generation job `{submitted_job_id}`.")
+                project_client.beta.datasets.delete_generation_job(job_id=submitted_job_id)
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(
+                    f"  (warning) could not delete data generation job `{submitted_job_id}`: {exc}"
+                )
 
-    finally:
-        # Best-effort cleanup of the temporary agent and seeded conversations.
-        # Wrap each step so a failure in one does not skip the others, and so
-        # cleanup never masks the real exception that brought us here.
         if conversation_ids:
             try:
                 with project_client.get_openai_client() as openai_client:

From ec425c480c43575aaf7cfd6c83258eda2f17b40f Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sat, 30 May 2026 21:39:18 -0700
Subject: [PATCH 3/8] [ai/azure-ai-projects] simplify traces-for-evaluation
 sample (review)

Drops the bring-your-own-agent mode and trims supporting plumbing so the sample is easier for a first-time reader to follow. The sample was 439 lines and is now 282 (for comparison, the sister sample sample_dataset_generation_job_simpleqna_with_agent_source.py is 180).

Changes:

- Self-contained mode only. The if seed_traces / else split, the BYO env vars (FOUNDRY_AGENT_NAME, FOUNDRY_TRACES_WINDOW_DAYS, TRACE_SEEDING_CONVERSATIONS, TRACE_SEEDING_TURNS), and the importlib.import_module dance for optional telemetry deps are gone. A 4-line note in the docstring tells BYO users which block to replace.

- Imports azure.monitor.opentelemetry and azure.ai.projects.telemetry directly at the top of the file.

- Shrinks AGENT_INSTRUCTIONS from ~40 lines to ~15 with only the policies the seeded prompts actually ask about.

- Drops the _safe_console helper and the per-turn preview prints. Cleanup output is still printed.

- Drops the opentelemetry force_flush try/except; the ingestion wait (TRACE_INGESTION_WAIT_SECONDS, 180s by default) covers exporter batching too.

- Replaces the four per-resource try/except cleanup blocks with a small _try_delete(label, fn, *args, **kwargs) helper.

Live-tested happy path against build26-bug-bash on gpt-5.1: 15 generated samples, all 6 resources cleaned up via finally, exit 0.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 552 ++++++------------
 1 file changed, 191 insertions(+), 361 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index 1374f29dd62d..8b5f06fd59b0 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -6,33 +6,26 @@
 
 """
 DESCRIPTION:
-    Generates an evaluation dataset from an agent's recent conversation
-    traces. The sample runs in one of two modes:
-
-      * Self-contained mode (default): Creates a temporary Foundry agent,
-        runs a few sample conversations against it with GenAI content
-        tracing enabled so spans flow to Application Insights, waits for
-        ingestion, then runs the data generation job. The temporary agent
-        and conversations are deleted at the end. Use this mode to try the
-        sample without preparing anything in advance.
-      * Bring-your-own-agent mode (BYO): Set FOUNDRY_AGENT_NAME to point at
-        an existing agent that already has recent conversation traces. The
-        sample skips agent creation and trace seeding and uses your agent
-        as-is.
-
-    In both modes, the sample:
-      1. Creates a `DataGenerationJob` (scenario=EVALUATION, type=traces)
-         that reads spans from Application Insights for the agent within a
-         time window and synthesizes question / answer pairs into a new
-         versioned Dataset.
-      2. Polls the job to completion and resolves the resulting
-         `DatasetVersion`.
-      3. Cleans up the generated dataset, the data generation job, and
-         (in self-contained mode) the temporary agent and conversations.
-
-    The Traces source consumes existing telemetry, so no `model_options`
-    are required — the service derives samples directly from the agent's
-    traces.
+    Generates an evaluation dataset from an agent's conversation traces.
+    The sample is fully self-contained:
+
+      1. Wires up Azure Monitor + the AIProjectInstrumentor so the temporary
+         agent's calls emit semantic GenAI spans (with message content) to
+         Application Insights.
+      2. Creates a temporary Foundry agent and runs a few sample
+         conversations against it so spans flow to Application Insights.
+      3. Waits for ingestion, then submits a `DataGenerationJob`
+         (scenario=EVALUATION, source=traces) that synthesizes question/
+         answer pairs from those spans.
+      4. Polls the job, fetches the resulting `DatasetVersion`, and prints
+         the count of generated samples.
+      5. Cleans up the dataset, job, seeded conversations, and the
+         temporary agent.
+
+    To run against an existing agent that already has recent traces in
+    Application Insights, replace the seeding block (step 2) with your
+    agent's name and skip the ingestion wait. The data-generation API call
+    (step 3) is the same.
 
 USAGE:
     python sample_dataset_generation_job_traces_for_evaluation.py
@@ -42,44 +35,27 @@
     pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv \\
         azure-monitor-opentelemetry azure-core-tracing-opentelemetry
 
-    (The two telemetry packages are only required for self-contained mode.)
-
     Set these environment variables with your own values:
     1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
        found in the overview page of your Microsoft Foundry project.
-    2) FOUNDRY_AGENT_NAME - Optional. The name of an existing agent (Foundry
-       Agent or OpenTelemetry-instrumented third-party agent) that already
-       has recent conversation traces in Application Insights. If set, the
-       sample skips agent creation and trace seeding and uses this agent.
-    3) FOUNDRY_MODEL_NAME - Required for self-contained mode. The Azure OpenAI
-       deployment name used to drive the temporary agent during trace
-       seeding. Ignored when FOUNDRY_AGENT_NAME is set.
-    4) DATASET_NAME - Optional. Name to assign to the generated output
+    2) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name used
+       to drive the temporary agent during trace seeding.
+    3) DATASET_NAME - Optional. Name to assign to the generated output
        dataset. Defaults to `traces-eval-sample`. The service caps the
-       rendered output name at 50 characters, so keep custom values short —
+       rendered output name at 50 characters, so keep custom values short -
        the sample appends a unique run id suffix.
-    5) FOUNDRY_TRACES_WINDOW_DAYS - Optional. How far back, in days, to look
-       for agent traces when in BYO mode. Defaults to 7. Ignored in
-       self-contained mode (the sample uses an exact window covering the
-       seeded traces).
-    6) POLL_INTERVAL_SECONDS - Optional. Number of seconds to sleep between
-       status polls for the data generation job. Defaults to 10.
-    7) TRACE_SEEDING_CONVERSATIONS - Optional. Number of conversations to
-       seed in self-contained mode. Defaults to 3.
-    8) TRACE_SEEDING_TURNS - Optional. Turns per seeded conversation in
-       self-contained mode. Defaults to 5.
-    9) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after seeding
-       for Application Insights to ingest the emitted spans before
+    4) POLL_INTERVAL_SECONDS - Optional. Seconds to sleep between status
+       polls for the data generation job. Defaults to 10.
+    5) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after
+       seeding for Application Insights to ingest the emitted spans before
        submitting the data generation job. Defaults to 180.
 """
 
-import importlib
 import os
-import sys
 import time
 import uuid
 from datetime import datetime, timedelta, timezone
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 from dotenv import load_dotenv
 
@@ -97,186 +73,89 @@
     TracesDataGenerationJobOptions,
     TracesDataGenerationJobSource,
 )
+from azure.ai.projects.telemetry import AIProjectInstrumentor
+from azure.monitor.opentelemetry import configure_azure_monitor
 
 load_dotenv()
 
-# Persona used when seeding traces in self-contained mode. Mirrors the
-# Widgets & Gizmos persona from
-# sample_dataset_generation_job_simpleqna_with_agent_source.py so the
-# generated traces have substantive multi-turn content the data generation
-# service can synthesize useful eval samples from.
+
+# Short persona used to make seeded traces look like real customer-support
+# conversations. The data-gen service synthesizes eval samples from these
+# traces, so the persona just needs enough domain detail to answer the
+# seeding prompts confidently.
 AGENT_INSTRUCTIONS = """\
-You are the Widgets & Gizmos customer-support agent. Help customers with
-returns, warranty claims, repairs, product specifications, compatibility,
-and ordering for Widgets, Gizmos, Sprockets, and accessories.
-
-Use this knowledge base when answering. Cite the relevant policy or spec
-directly when you can.
-
-Returns
-  * Unopened products may be returned within 30 days for a full refund.
-  * Opened products may be returned within 14 days for a refund minus a
-    10% restocking fee. Defective products may be returned within 90 days
-    at no cost.
-  * Refunds are processed within 5-7 business days after the return is
-    received and inspected.
-  * Items lost in shipping should be reported within 21 days of the order
-    date; we re-ship at no cost.
-
-Warranty
-  * Standard products carry a 1-year limited warranty against
-    manufacturing defects.
-  * The Deluxe Sprocket carries a 5-year limited warranty.
-  * Warranty repairs are free. Customer ships the unit to us prepaid; we
-    cover return shipping. Typical turnaround is 10-14 business days.
-
-Specifications
-  * Standard Widget: 4 inches, blue or red, weighs 6oz, made of aluminum.
-  * Compact Widget: 2 inches, gray only, weighs 3oz, made of aluminum.
-  * Gizmo: 6 inches, available in green, weighs 10oz, made of stainless
-    steel and ABS plastic. Compatible with all Sprocket Adapter v2 mounts.
-  * Sprocket Adapter v2: universal mount that fits Widgets, Gizmos, and
-    third-party 1/4-20 hardware.
-
-Pricing & bundles
-  * Standard Widget: $19.99 each, bundle of 10 for $149.99.
-  * Gizmo: $34.99 each, bundle of 5 for $129.99.
-  * Deluxe Sprocket: $79.99 each.
-
-If you do not know the answer, say so and offer to escalate. Be concise.
+You are the Widgets & Gizmos customer-support agent.
+
+Returns: Unopened products may be returned within 30 days for a full refund.
+Defective products may be returned within 90 days at no cost. Refunds take
+5-7 business days.
+
+Warranty: Standard products carry a 1-year limited warranty. The Deluxe
+Sprocket carries a 5-year warranty. Warranty repairs are free; we cover
+return shipping. Repairs take 10-14 business days.
+
+Products: Standard Widget is $19.99 (bundle of 10 for $149.99). Deluxe
+Sprocket is $79.99.
+
+If you do not know the answer, say so. Be concise.
 """
 
-# Multi-turn conversation arcs used to seed traces. Each inner list is one
-# conversation; the sample runs each turn against the temporary agent.
-SEEDING_CONVERSATION_ARCS = [
+
+SEEDING_CONVERSATIONS: List[List[str]] = [
     [
-        "Hi, I need to return a defective Standard Widget.",
-        "I bought it 45 days ago. Is it still eligible for a refund?",
-        "What about a Gizmo I ordered but never received - it has been 3 weeks?",
-        "Can I get a refund instead of a replacement shipment?",
-        "How long will the refund take to show up on my card?",
+        "Can I return a defective Standard Widget after 45 days?",
+        "How long does a refund take?",
+        "What about an unopened Standard Widget?",
+        "Do I pay return shipping?",
+        "Is there a restocking fee?",
     ],
     [
-        "Does the Deluxe Sprocket come with a warranty?",
-        "What exactly does the warranty cover?",
-        "My Deluxe Sprocket stopped turning after 6 months - what should I do?",
-        "Do I have to pay for return shipping on a warranty claim?",
-        "How long do warranty repairs usually take?",
+        "What is the warranty on the Deluxe Sprocket?",
+        "What does the warranty cover?",
+        "Do warranty repairs cost anything?",
+        "How long do warranty repairs take?",
+        "Who pays return shipping for a warranty claim?",
     ],
     [
-        "What is the difference between a Standard Widget and a Compact Widget?",
-        "Is the Compact Widget compatible with the Sprocket Adapter v2?",
-        "What colors and sizes are Gizmos available in?",
-        "How much is a bundle of 10 Standard Widgets?",
-        "Do you carry any third-party accessories that fit the Sprocket Adapter v2?",
+        "How much is a Standard Widget?",
+        "Is there a bundle deal?",
+        "What is the Deluxe Sprocket price?",
+        "What products do you carry?",
+        "Do you sell accessories?",
     ],
 ]
 
+
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
-provided_agent_name = os.environ.get("FOUNDRY_AGENT_NAME", "").strip()
+model_deployment = os.environ["FOUNDRY_MODEL_NAME"]
 dataset_name = os.environ.get("DATASET_NAME", "traces-eval-sample")
 poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10"))
-
-# Self-contained mode is enabled unless the user pointed at an existing agent.
-seed_traces = not provided_agent_name
-
-# Window default differs by mode: in self-contained mode we compute the
-# window exactly around the seeded traces (so this knob is ignored).
-traces_window_days = int(os.environ.get("FOUNDRY_TRACES_WINDOW_DAYS", "7"))
-
-# Seeding knobs (only used when seed_traces is True).
-trace_seeding_conversations = int(
-    os.environ.get("TRACE_SEEDING_CONVERSATIONS", str(len(SEEDING_CONVERSATION_ARCS)))
-)
-trace_seeding_turns = int(
-    os.environ.get("TRACE_SEEDING_TURNS", str(len(SEEDING_CONVERSATION_ARCS[0])))
-)
 trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180"))
 
-if seed_traces and "FOUNDRY_MODEL_NAME" not in os.environ:
-    raise EnvironmentError(
-        "Self-contained mode requires FOUNDRY_MODEL_NAME (the Azure OpenAI deployment "
-        "name used to drive the temporary agent). Either set FOUNDRY_MODEL_NAME or set "
-        "FOUNDRY_AGENT_NAME to use an existing agent with traces."
-    )
-
-# Unique per-run output dataset name so repeated runs do not collide.
-# Output names are capped at 50 characters by the service.
+# Unique per-run id used for the output dataset name and the temporary
+# agent name so repeated runs do not collide and so any matched traces
+# clearly belong to this run. Output names are capped at 50 chars.
 run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
 output_dataset_name = f"{dataset_name}-{run_id}"
 if len(output_dataset_name) > 50:
     raise ValueError(
         f"Output dataset name `{output_dataset_name}` exceeds the 50-character service limit. "
-        f"Lower DATASET_NAME (currently `{dataset_name}`) so that `<DATASET_NAME>-<run id>` fits within 50 characters."
+        f"Shorten DATASET_NAME (currently `{dataset_name}`) so that `<name>-<run id>` fits within 50 characters."
     )
 
-# Agent name used to read traces. In self-contained mode we use a unique
-# per-run name so concurrent runs do not collide and so we know any matched
-# traces belong to this run.
-agent_name = provided_agent_name or f"traces-eval-sample-{run_id}"
+agent_name = f"traces-eval-sample-{run_id}"
 
 TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED}
 
 
-def _safe_console(text: str) -> str:
-    """Encode `text` so it always prints on the active stdout encoding.
-
-    Some Windows consoles default to cp1252, which cannot encode characters
-    the model may emit (e.g. smart quotes, non-breaking hyphens). We replace
-    any unencodable code points with `?` so a preview line never crashes the
-    sample.
-    """
-    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
-    return text.encode(encoding, errors="replace").decode(encoding, errors="replace")
-
-
-def _seed_agent_traces(
-    project_client: AIProjectClient,
-    agent_name_to_use: str,
-    agent_id_to_use: str,
-    conversation_count: int,
-    turns_per_conversation: int,
-    conversation_ids: List[str],
-) -> None:
-    """Run a few conversations against the agent so GenAI spans flow to App Insights.
-
-    Created conversation IDs are appended to `conversation_ids` as each
-    conversation is created, so the caller can clean them up even if seeding
-    raises mid-way through.
-    """
-    arcs = SEEDING_CONVERSATION_ARCS
-    with project_client.get_openai_client() as openai_client:
-        for ci in range(conversation_count):
-            arc = arcs[ci % len(arcs)]
-            conversation = openai_client.conversations.create()
-            conversation_ids.append(conversation.id)
-            print(f"  - conversation {ci + 1}/{conversation_count} (id: {conversation.id})")
-            for ti in range(turns_per_conversation):
-                prompt = arc[ti % len(arc)]
-                response = openai_client.responses.create(
-                    conversation=conversation.id,
-                    input=prompt,
-                    extra_body={
-                        "agent_reference": {
-                            "name": agent_name_to_use,
-                            "id": agent_id_to_use,
-                            "type": "agent_reference",
-                        }
-                    },
-                )
-                preview = (response.output_text or "").replace("\n", " ")
-                if len(preview) > 80:
-                    preview = preview[:77] + "..."
-                print(_safe_console(f"      turn {ti + 1}: {prompt}"))
-                print(_safe_console(f"        response: {preview}"))
-
-
-mode_label = (
-    "self-contained (will create a temporary agent and seed traces)"
-    if seed_traces
-    else f"bring-your-own-agent (`{provided_agent_name}`)"
-)
-print(f"Mode: {mode_label}.")
+def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: object) -> None:
+    """Best-effort delete; logs and swallows failures so later cleanup steps still run."""
+    try:
+        fn(*args, **kwargs)
+        print(f"Deleted {label}.")
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        print(f"  (warning) could not delete {label}: {exc}")
+
 
 with (
     DefaultAzureCredential() as credential,
@@ -285,143 +164,103 @@ def _seed_agent_traces(
 
     created_agent = None
     conversation_ids: List[str] = []
-    seed_start: Optional[datetime] = None
     submitted_job_id: Optional[str] = None
     created_dataset: Optional[DatasetVersion] = None
 
     try:
-        if seed_traces:
-            # --------------------------------------------------------------
-            # 0a. Wire up Azure Monitor + GenAI instrumentation so calls to
-            #     responses.create emit semantic GenAI spans (with message
-            #     content) to Application Insights.
-            # --------------------------------------------------------------
-            try:
-                configure_azure_monitor = importlib.import_module(
-                    "azure.monitor.opentelemetry"
-                ).configure_azure_monitor
-                AIProjectInstrumentor = importlib.import_module(
-                    "azure.ai.projects.telemetry"
-                ).AIProjectInstrumentor
-            except ImportError as exc:
-                raise ImportError(
-                    "Self-contained mode requires the `azure-monitor-opentelemetry` and "
-                    "`azure-core-tracing-opentelemetry` packages. Install them with "
-                    "`pip install azure-monitor-opentelemetry azure-core-tracing-opentelemetry` "
-                    "or set FOUNDRY_AGENT_NAME to use an existing agent with traces."
-                ) from exc
-
-            # AIProjectInstrumentor requires this env var be set BEFORE
-            # instrument() is called. We force it on (not setdefault) so the
-            # temporary agent's calls always produce GenAI spans the data-gen
-            # service can read.
-            os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true"
-
-            print("Fetch Application Insights connection string and configure Azure Monitor exporter.")
-            connection_string = project_client.telemetry.get_application_insights_connection_string()
-            configure_azure_monitor(connection_string=connection_string)
-            AIProjectInstrumentor().instrument(enable_content_recording=True)
-
-            # --------------------------------------------------------------
-            # 0b. Create the temporary agent.
-            # --------------------------------------------------------------
-            model_deployment = os.environ["FOUNDRY_MODEL_NAME"]
-            print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).")
-            created_agent = project_client.agents.create_version(
-                agent_name=agent_name,
-                definition=PromptAgentDefinition(
-                    model=model_deployment,
-                    instructions=AGENT_INSTRUCTIONS,
-                ),
-            )
-            print(
-                f"Agent created (id: {created_agent.id}, name: {created_agent.name}, "
-                f"version: {created_agent.version})."
-            )
+        # ------------------------------------------------------------------
+        # 1. Configure Azure Monitor + GenAI instrumentation so the
+        #    temporary agent's calls emit semantic GenAI spans (with
+        #    message content) to Application Insights.
+        # ------------------------------------------------------------------
+        # AIProjectInstrumentor reads this env var at instrument() time.
+        os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true"
 
-            # --------------------------------------------------------------
-            # 0c. Seed traces by running a few conversations against the agent.
-            # --------------------------------------------------------------
-            seed_start = datetime.now(tz=timezone.utc)
-            print(
-                f"Seed {trace_seeding_conversations} conversation(s) x "
-                f"{trace_seeding_turns} turn(s) against the agent so spans flow to Application Insights."
-            )
-            _seed_agent_traces(
-                project_client=project_client,
-                agent_name_to_use=created_agent.name,
-                agent_id_to_use=created_agent.id,
-                conversation_count=trace_seeding_conversations,
-                turns_per_conversation=trace_seeding_turns,
-                conversation_ids=conversation_ids,
-            )
+        print("Configure Azure Monitor exporter from the project's Application Insights connection.")
+        connection_string = project_client.telemetry.get_application_insights_connection_string()
+        configure_azure_monitor(connection_string=connection_string)
+        AIProjectInstrumentor().instrument(enable_content_recording=True)
 
-            # Flush any buffered spans so the only delay we wait for below is
-            # ingestion delay, not exporter batching delay.
-            try:
-                from opentelemetry import trace as _otel_trace  # pylint: disable=import-outside-toplevel
+        # ------------------------------------------------------------------
+        # 2. Create a temporary agent and seed traces by running a few
+        #    conversations against it.
+        # ------------------------------------------------------------------
+        print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).")
+        created_agent = project_client.agents.create_version(
+            agent_name=agent_name,
+            definition=PromptAgentDefinition(model=model_deployment, instructions=AGENT_INSTRUCTIONS),
+        )
+        print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
 
-                tracer_provider = _otel_trace.get_tracer_provider()
-                force_flush = getattr(tracer_provider, "force_flush", None)
-                if callable(force_flush):
-                    force_flush()
-            except Exception as exc:  # pylint: disable=broad-exception-caught
-                print(f"  (warning) could not force-flush tracer provider: {exc}")
+        seed_start = datetime.now(tz=timezone.utc)
+        print(
+            f"Seed {len(SEEDING_CONVERSATIONS)} conversation(s) x "
+            f"{len(SEEDING_CONVERSATIONS[0])} turn(s) against the agent."
+        )
+        with project_client.get_openai_client() as openai_client:
+            for ci, arc in enumerate(SEEDING_CONVERSATIONS, start=1):
+                conversation = openai_client.conversations.create()
+                conversation_ids.append(conversation.id)
+                print(f"  - conversation {ci}/{len(SEEDING_CONVERSATIONS)} (id: {conversation.id})")
+                for prompt in arc:
+                    openai_client.responses.create(
+                        conversation=conversation.id,
+                        input=prompt,
+                        extra_body={
+                            "agent_reference": {
+                                "name": created_agent.name,
+                                "id": created_agent.id,
+                                "type": "agent_reference",
+                            }
+                        },
+                    )
 
-            print(
-                f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the "
-                f"emitted spans. Override with TRACE_INGESTION_WAIT_SECONDS.",
-                flush=True,
-            )
-            time.sleep(trace_ingestion_wait_seconds)
+        print(
+            f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the emitted spans. "
+            f"Override with TRACE_INGESTION_WAIT_SECONDS.",
+            flush=True,
+        )
+        time.sleep(trace_ingestion_wait_seconds)
 
         # ------------------------------------------------------------------
-        # 1. Submit a data generation job that reads agent traces.
+        # 3. Submit a data generation job that reads the agent's traces.
         # ------------------------------------------------------------------
-        if seed_traces and seed_start is not None:
-            # Window covers a small backoff before seeding through "now", which
-            # guarantees the seeded spans fall inside the queried window.
-            start_time = seed_start - timedelta(minutes=5)
-            end_time = datetime.now(tz=timezone.utc)
-        else:
-            # BYO mode: use the user-configurable look-back window.
-            end_time = datetime.now(tz=timezone.utc)
-            start_time = end_time - timedelta(days=traces_window_days)
+        # Cover a small backoff before seeding through "now" so the seeded
+        # spans definitely fall inside the queried window.
+        start_time = seed_start - timedelta(minutes=5)
+        end_time = datetime.now(tz=timezone.utc)
 
         print(
             f"Create a data generation job from traces for agent `{agent_name}` "
             f"(window: {start_time.isoformat()} .. {end_time.isoformat()})."
         )
-        job = DataGenerationJob(
-            inputs=DataGenerationJobInputs(
-                name=f"traces-eval-{run_id}",
-                scenario=DataGenerationJobScenario.EVALUATION,
-                sources=[
-                    TracesDataGenerationJobSource(
-                        description="Application Insights conversation traces for the Foundry agent.",
-                        agent_name=agent_name,
-                        start_time=start_time,
-                        end_time=end_time,
-                    ),
-                ],
-                options=TracesDataGenerationJobOptions(
+        job = project_client.beta.datasets.create_generation_job(
+            job=DataGenerationJob(
+                inputs=DataGenerationJobInputs(
+                    name=f"traces-eval-{run_id}",
+                    scenario=DataGenerationJobScenario.EVALUATION,
+                    sources=[
+                        TracesDataGenerationJobSource(
+                            description="Application Insights conversation traces for the temporary agent.",
+                            agent_name=agent_name,
+                            start_time=start_time,
+                            end_time=end_time,
+                        ),
+                    ],
                     # Service requires max_samples to be between 15 and 1000.
-                    max_samples=15,
+                    options=TracesDataGenerationJobOptions(max_samples=15),
+                    output_options=DataGenerationJobOutputOptions(name=output_dataset_name),
                 ),
-                output_options=DataGenerationJobOutputOptions(name=output_dataset_name),
             ),
         )
-        job = project_client.beta.datasets.create_generation_job(job=job)
         submitted_job_id = job.id
         print(f"Created data generation job `{job.id}` (status: `{job.status}`).")
 
         print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True)
-        while True:
-            job = project_client.beta.datasets.get_generation_job(job_id=job.id)
-            if job.status in TERMINAL_STATUSES:
-                break
+        while job.status not in TERMINAL_STATUSES:
             time.sleep(poll_interval_seconds)
             print(".", end="", flush=True)
+            job = project_client.beta.datasets.get_generation_job(job_id=job.id)
         print()
         print(f"Final job status: `{job.status}`.")
 
@@ -429,68 +268,59 @@ def _seed_agent_traces(
             message = job.error.message if job.error is not None else "<no error message>"
             raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}")
 
-        # Locate the Dataset output produced by the job.
-        output_name: str = ""
-        output_version: str = ""
-        for output in (job.result.outputs if job.result is not None else None) or []:
-            if isinstance(output, DatasetDataGenerationJobOutput):
-                output_name = output.name or ""
-                output_version = output.version or ""
-                break
-        if not output_name or not output_version:
+        # ------------------------------------------------------------------
+        # 4. Resolve the generated dataset.
+        # ------------------------------------------------------------------
+        outputs = (job.result.outputs if job.result is not None else None) or []
+        dataset_output = next(
+            (o for o in outputs if isinstance(o, DatasetDataGenerationJobOutput)), None
+        )
+        if dataset_output is None or not dataset_output.name or not dataset_output.version:
             raise RuntimeError(f"Job `{job.id}` did not produce a dataset output.")
 
-        dataset: DatasetVersion = project_client.datasets.get(name=output_name, version=output_version)
-        created_dataset = dataset
-        print(f"Generated dataset: name=`{dataset.name}` version=`{dataset.version}` id=`{dataset.id}`")
+        created_dataset = project_client.datasets.get(
+            name=dataset_output.name, version=dataset_output.version
+        )
+        print(
+            f"Generated dataset: name=`{created_dataset.name}` "
+            f"version=`{created_dataset.version}` id=`{created_dataset.id}`"
+        )
         if job.result is not None and job.result.generated_samples is not None:
             print(f"Generated samples: {job.result.generated_samples}")
 
     finally:
-        # Best-effort cleanup. Each step is wrapped in its own try/except so a
-        # failure in one does not skip the others, and so cleanup never masks
-        # the real exception that brought us here. Order is outputs -> producers:
-        # dataset -> job -> seeded conversations -> temporary agent.
+        # Best-effort cleanup, outputs -> producers (dataset, job, conversations, agent).
         if created_dataset is not None:
-            try:
-                print(
-                    f"Delete the generated dataset `{created_dataset.name}` v{created_dataset.version}."
-                )
-                project_client.datasets.delete(
-                    name=created_dataset.name or "", version=created_dataset.version or ""
-                )
-            except Exception as exc:  # pylint: disable=broad-exception-caught
-                print(
-                    f"  (warning) could not delete generated dataset "
-                    f"`{created_dataset.name}` v{created_dataset.version}: {exc}"
-                )
+            _try_delete(
+                f"generated dataset `{created_dataset.name}` v{created_dataset.version}",
+                project_client.datasets.delete,
+                name=created_dataset.name or "",
+                version=created_dataset.version or "",
+            )
 
         if submitted_job_id is not None:
-            try:
-                print(f"Delete the data generation job `{submitted_job_id}`.")
-                project_client.beta.datasets.delete_generation_job(job_id=submitted_job_id)
-            except Exception as exc:  # pylint: disable=broad-exception-caught
-                print(
-                    f"  (warning) could not delete data generation job `{submitted_job_id}`: {exc}"
-                )
+            _try_delete(
+                f"data generation job `{submitted_job_id}`",
+                project_client.beta.datasets.delete_generation_job,
+                job_id=submitted_job_id,
+            )
 
         if conversation_ids:
             try:
                 with project_client.get_openai_client() as openai_client:
                     for cid in conversation_ids:
-                        try:
-                            openai_client.conversations.delete(conversation_id=cid)
-                            print(f"Deleted seeded conversation `{cid}`.")
-                        except Exception as exc:  # pylint: disable=broad-exception-caught
-                            print(f"  (warning) could not delete conversation `{cid}`: {exc}")
+                        _try_delete(
+                            f"seeded conversation `{cid}`",
+                            openai_client.conversations.delete,
+                            conversation_id=cid,
+                        )
             except Exception as exc:  # pylint: disable=broad-exception-caught
                 print(f"  (warning) could not open OpenAI client for conversation cleanup: {exc}")
 
         if created_agent is not None:
-            try:
-                project_client.agents.delete_version(
-                    agent_name=created_agent.name, agent_version=created_agent.version
-                )
-                print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.")
-            except Exception as exc:  # pylint: disable=broad-exception-caught
-                print(f"  (warning) could not delete temporary agent `{created_agent.name}`: {exc}")
+            _try_delete(
+                f"temporary agent `{created_agent.name}` v{created_agent.version}",
+                project_client.agents.delete_version,
+                agent_name=created_agent.name,
+                agent_version=created_agent.version,
+            )

From 6a255e8440051b927725ccae3784effad24e048a Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sat, 30 May 2026 21:51:11 -0700
Subject: [PATCH 4/8] Tighten comments in self-contained traces sample

Trim verbose section banners and docstring prose; replace multi-line
comments with single-line equivalents. No behavior change.

282 -> 257 lines (-9%).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 73 ++++++-------------
 1 file changed, 24 insertions(+), 49 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index 8b5f06fd59b0..41e798cc3844 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -9,23 +9,16 @@
     Generates an evaluation dataset from an agent's conversation traces.
     The sample is fully self-contained:
 
-      1. Wires up Azure Monitor + the AIProjectInstrumentor so the temporary
-         agent's calls emit semantic GenAI spans (with message content) to
-         Application Insights.
-      2. Creates a temporary Foundry agent and runs a few sample
-         conversations against it so spans flow to Application Insights.
+      1. Wires up Azure Monitor + AIProjectInstrumentor so agent calls emit
+         semantic GenAI spans (with content) to Application Insights.
+      2. Creates a temporary agent and seeds spans with sample conversations.
       3. Waits for ingestion, then submits a `DataGenerationJob`
-         (scenario=EVALUATION, source=traces) that synthesizes question/
-         answer pairs from those spans.
-      4. Polls the job, fetches the resulting `DatasetVersion`, and prints
-         the count of generated samples.
-      5. Cleans up the dataset, job, seeded conversations, and the
-         temporary agent.
-
-    To run against an existing agent that already has recent traces in
-    Application Insights, replace the seeding block (step 2) with your
-    agent's name and skip the ingestion wait. The data-generation API call
-    (step 3) is the same.
+         (scenario=EVALUATION, source=traces) that synthesizes Q/A pairs.
+      4. Polls the job and fetches the resulting `DatasetVersion`.
+      5. Cleans up the dataset, job, seeded conversations, and agent.
+
+    To adapt for an existing agent with recent traces, replace step 2 with
+    your agent's name and skip the ingestion wait.
 
 USAGE:
     python sample_dataset_generation_job_traces_for_evaluation.py
@@ -40,15 +33,13 @@
        found in the overview page of your Microsoft Foundry project.
     2) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name used
        to drive the temporary agent during trace seeding.
-    3) DATASET_NAME - Optional. Name to assign to the generated output
-       dataset. Defaults to `traces-eval-sample`. The service caps the
-       rendered output name at 50 characters, so keep custom values short -
-       the sample appends a unique run id suffix.
-    4) POLL_INTERVAL_SECONDS - Optional. Seconds to sleep between status
-       polls for the data generation job. Defaults to 10.
-    5) TRACE_INGESTION_WAIT_SECONDS - Optional. Seconds to wait after
-       seeding for Application Insights to ingest the emitted spans before
-       submitting the data generation job. Defaults to 180.
+    3) DATASET_NAME - Optional. Output dataset name. Defaults to
+       `traces-eval-sample`. Service caps the rendered name at 50 chars
+       (the sample appends a unique run-id suffix).
+    4) POLL_INTERVAL_SECONDS - Optional. Sleep between job status polls.
+       Defaults to 10.
+    5) TRACE_INGESTION_WAIT_SECONDS - Optional. Wait after seeding for
+       Application Insights ingestion. Defaults to 180.
 """
 
 import os
@@ -79,10 +70,7 @@
 load_dotenv()
 
 
-# Short persona used to make seeded traces look like real customer-support
-# conversations. The data-gen service synthesizes eval samples from these
-# traces, so the persona just needs enough domain detail to answer the
-# seeding prompts confidently.
+# Short persona; covers only the topics the seeded prompts ask about.
 AGENT_INSTRUCTIONS = """\
 You are the Widgets & Gizmos customer-support agent.
 
@@ -132,9 +120,7 @@
 poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10"))
 trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180"))
 
-# Unique per-run id used for the output dataset name and the temporary
-# agent name so repeated runs do not collide and so any matched traces
-# clearly belong to this run. Output names are capped at 50 chars.
+# Per-run id keeps repeated runs from colliding; output names are capped at 50 chars.
 run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
 output_dataset_name = f"{dataset_name}-{run_id}"
 if len(output_dataset_name) > 50:
@@ -168,12 +154,9 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs:
     created_dataset: Optional[DatasetVersion] = None
 
     try:
-        # ------------------------------------------------------------------
-        # 1. Configure Azure Monitor + GenAI instrumentation so the
-        #    temporary agent's calls emit semantic GenAI spans (with
-        #    message content) to Application Insights.
-        # ------------------------------------------------------------------
-        # AIProjectInstrumentor reads this env var at instrument() time.
+        # 1. Configure Azure Monitor + GenAI instrumentation to emit spans with content.
+        # AIProjectInstrumentor gates on this env var at instrument() time; without it
+        # instrument() returns early and no spans flow.
         os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true"
 
         print("Configure Azure Monitor exporter from the project's Application Insights connection.")
@@ -181,10 +164,7 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs:
         configure_azure_monitor(connection_string=connection_string)
         AIProjectInstrumentor().instrument(enable_content_recording=True)
 
-        # ------------------------------------------------------------------
-        # 2. Create a temporary agent and seed traces by running a few
-        #    conversations against it.
-        # ------------------------------------------------------------------
+        # 2. Create a temporary agent and seed traces.
         print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).")
         created_agent = project_client.agents.create_version(
             agent_name=agent_name,
@@ -222,11 +202,8 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs:
         )
         time.sleep(trace_ingestion_wait_seconds)
 
-        # ------------------------------------------------------------------
         # 3. Submit a data generation job that reads the agent's traces.
-        # ------------------------------------------------------------------
-        # Cover a small backoff before seeding through "now" so the seeded
-        # spans definitely fall inside the queried window.
+        # Small backoff so the seeded spans fall inside the queried window.
         start_time = seed_start - timedelta(minutes=5)
         end_time = datetime.now(tz=timezone.utc)
 
@@ -247,7 +224,7 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs:
                             end_time=end_time,
                         ),
                     ],
-                    # Service requires max_samples to be between 15 and 1000.
+                    # Service requires max_samples in [15, 1000].
                     options=TracesDataGenerationJobOptions(max_samples=15),
                     output_options=DataGenerationJobOutputOptions(name=output_dataset_name),
                 ),
@@ -268,9 +245,7 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs:
             message = job.error.message if job.error is not None else "<no error message>"
             raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}")
 
-        # ------------------------------------------------------------------
         # 4. Resolve the generated dataset.
-        # ------------------------------------------------------------------
         outputs = (job.result.outputs if job.result is not None else None) or []
         dataset_output = next(
             (o for o in outputs if isinstance(o, DatasetDataGenerationJobOutput)), None

From e321c41a1e5749d710d79574ad2216ea10475860 Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sat, 30 May 2026 21:54:04 -0700
Subject: [PATCH 5/8] Inline cleanup try/except in self-contained traces sample

Replace the _try_delete helper with one inline try/except per resource.
Each cleanup now reads top-to-bottom at the call site (preferred for
sample readability) and drops a layer of indirection.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 59 +++++++++----------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index 41e798cc3844..a10257e2662f 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -46,7 +46,7 @@
 import time
 import uuid
 from datetime import datetime, timedelta, timezone
-from typing import Callable, List, Optional
+from typing import List, Optional
 
 from dotenv import load_dotenv
 
@@ -134,15 +134,6 @@
 TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED}
 
 
-def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs: object) -> None:
-    """Best-effort delete; logs and swallows failures so later cleanup steps still run."""
-    try:
-        fn(*args, **kwargs)
-        print(f"Deleted {label}.")
-    except Exception as exc:  # pylint: disable=broad-exception-caught
-        print(f"  (warning) could not delete {label}: {exc}")
-
-
 with (
     DefaultAzureCredential() as credential,
     AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
@@ -266,36 +257,40 @@ def _try_delete(label: str, fn: Callable[..., object], *args: object, **kwargs:
     finally:
         # Best-effort cleanup, outputs -> producers (dataset, job, conversations, agent).
         if created_dataset is not None:
-            _try_delete(
-                f"generated dataset `{created_dataset.name}` v{created_dataset.version}",
-                project_client.datasets.delete,
-                name=created_dataset.name or "",
-                version=created_dataset.version or "",
-            )
+            try:
+                project_client.datasets.delete(
+                    name=created_dataset.name or "",
+                    version=created_dataset.version or "",
+                )
+                print(f"Deleted dataset `{created_dataset.name}` v{created_dataset.version}.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete dataset: {exc}")
 
         if submitted_job_id is not None:
-            _try_delete(
-                f"data generation job `{submitted_job_id}`",
-                project_client.beta.datasets.delete_generation_job,
-                job_id=submitted_job_id,
-            )
+            try:
+                project_client.beta.datasets.delete_generation_job(job_id=submitted_job_id)
+                print(f"Deleted data generation job `{submitted_job_id}`.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete job: {exc}")
 
         if conversation_ids:
             try:
                 with project_client.get_openai_client() as openai_client:
                     for cid in conversation_ids:
-                        _try_delete(
-                            f"seeded conversation `{cid}`",
-                            openai_client.conversations.delete,
-                            conversation_id=cid,
-                        )
+                        try:
+                            openai_client.conversations.delete(conversation_id=cid)
+                            print(f"Deleted seeded conversation `{cid}`.")
+                        except Exception as exc:  # pylint: disable=broad-exception-caught
+                            print(f"  (warning) could not delete conversation `{cid}`: {exc}")
             except Exception as exc:  # pylint: disable=broad-exception-caught
                 print(f"  (warning) could not open OpenAI client for conversation cleanup: {exc}")
 
         if created_agent is not None:
-            _try_delete(
-                f"temporary agent `{created_agent.name}` v{created_agent.version}",
-                project_client.agents.delete_version,
-                agent_name=created_agent.name,
-                agent_version=created_agent.version,
-            )
+            try:
+                project_client.agents.delete_version(
+                    agent_name=created_agent.name,
+                    agent_version=created_agent.version,
+                )
+                print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete agent: {exc}")

From 7cad865defb518b16dcfb5e7cc0542445a8b8975 Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sat, 30 May 2026 22:27:24 -0700
Subject: [PATCH 6/8] Compress persona and replace conversation literal with a
 loop

Persona collapsed to 4 inline declarations (14 -> 5 lines). The nested
SEEDING_CONVERSATIONS list-of-lists is replaced by a flat SEED_PROMPTS
list plus NUM_CONVERSATIONS constant; the seeding loop cycles each
conversation through the same prompts. Behavior unchanged - still 3 x 5
= 15 turns and 15 generated samples (live-tested).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 58 +++++--------------
 1 file changed, 15 insertions(+), 43 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index a10257e2662f..228e5e1d9e25 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -72,46 +72,21 @@
 
 # Short persona; covers only the topics the seeded prompts ask about.
 AGENT_INSTRUCTIONS = """\
-You are the Widgets & Gizmos customer-support agent.
-
-Returns: Unopened products may be returned within 30 days for a full refund.
-Defective products may be returned within 90 days at no cost. Refunds take
-5-7 business days.
-
-Warranty: Standard products carry a 1-year limited warranty. The Deluxe
-Sprocket carries a 5-year warranty. Warranty repairs are free; we cover
-return shipping. Repairs take 10-14 business days.
-
-Products: Standard Widget is $19.99 (bundle of 10 for $149.99). Deluxe
-Sprocket is $79.99.
-
-If you do not know the answer, say so. Be concise.
+Widgets & Gizmos support agent. Be concise. Say so if unsure.
+Returns: unopened 30 days full refund; defective 90 days free; refunds 5-7 business days.
+Warranty: Standard 1 year, Deluxe Sprocket 5 years; repairs free, we pay shipping, 10-14 days.
+Products: Standard Widget $19.99 (10-pack $149.99); Deluxe Sprocket $79.99.
 """
 
 
-SEEDING_CONVERSATIONS: List[List[str]] = [
-    [
-        "Can I return a defective Standard Widget after 45 days?",
-        "How long does a refund take?",
-        "What about an unopened Standard Widget?",
-        "Do I pay return shipping?",
-        "Is there a restocking fee?",
-    ],
-    [
-        "What is the warranty on the Deluxe Sprocket?",
-        "What does the warranty cover?",
-        "Do warranty repairs cost anything?",
-        "How long do warranty repairs take?",
-        "Who pays return shipping for a warranty claim?",
-    ],
-    [
-        "How much is a Standard Widget?",
-        "Is there a bundle deal?",
-        "What is the Deluxe Sprocket price?",
-        "What products do you carry?",
-        "Do you sell accessories?",
-    ],
+SEED_PROMPTS = [
+    "Refund policy?",
+    "Warranty length?",
+    "Standard Widget price?",
+    "Any bundle deal?",
+    "Who pays shipping for warranty repairs?",
 ]
+NUM_CONVERSATIONS = 3  # NUM_CONVERSATIONS * len(SEED_PROMPTS) must be >= max_samples (15).
 
 
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
@@ -164,16 +139,13 @@
         print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
 
         seed_start = datetime.now(tz=timezone.utc)
-        print(
-            f"Seed {len(SEEDING_CONVERSATIONS)} conversation(s) x "
-            f"{len(SEEDING_CONVERSATIONS[0])} turn(s) against the agent."
-        )
+        print(f"Seed {NUM_CONVERSATIONS} conversation(s) x {len(SEED_PROMPTS)} turn(s) against the agent.")
         with project_client.get_openai_client() as openai_client:
-            for ci, arc in enumerate(SEEDING_CONVERSATIONS, start=1):
+            for ci in range(1, NUM_CONVERSATIONS + 1):
                 conversation = openai_client.conversations.create()
                 conversation_ids.append(conversation.id)
-                print(f"  - conversation {ci}/{len(SEEDING_CONVERSATIONS)} (id: {conversation.id})")
-                for prompt in arc:
+                print(f"  - conversation {ci}/{NUM_CONVERSATIONS} (id: {conversation.id})")
+                for prompt in SEED_PROMPTS:
                     openai_client.responses.create(
                         conversation=conversation.id,
                         input=prompt,

From d64140571c68992fa9e5e4705de6a92089d3dd3f Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sun, 31 May 2026 09:56:05 -0700
Subject: [PATCH 7/8] Minimize self-contained traces sample to 1 seeded turn

max_samples is a cap on generated samples, not a floor on input traces.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 75 ++++++++-----------
 1 file changed, 30 insertions(+), 45 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index 228e5e1d9e25..b61f37577f8c 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -46,7 +46,7 @@
 import time
 import uuid
 from datetime import datetime, timedelta, timezone
-from typing import List, Optional
+from typing import Optional
 
 from dotenv import load_dotenv
 
@@ -70,23 +70,13 @@
 load_dotenv()
 
 
-# Short persona; covers only the topics the seeded prompts ask about.
-AGENT_INSTRUCTIONS = """\
-Widgets & Gizmos support agent. Be concise. Say so if unsure.
-Returns: unopened 30 days full refund; defective 90 days free; refunds 5-7 business days.
-Warranty: Standard 1 year, Deluxe Sprocket 5 years; repairs free, we pay shipping, 10-14 days.
-Products: Standard Widget $19.99 (10-pack $149.99); Deluxe Sprocket $79.99.
-"""
-
-
-SEED_PROMPTS = [
-    "Refund policy?",
-    "Warranty length?",
-    "Standard Widget price?",
-    "Any bundle deal?",
-    "Who pays shipping for warranty repairs?",
-]
-NUM_CONVERSATIONS = 3  # NUM_CONVERSATIONS * len(SEED_PROMPTS) must be >= max_samples (15).
+# Minimal persona + prompt; one seeded turn is enough for the job to succeed
+# (max_samples is the cap on generated samples, not a floor on input traces).
+AGENT_INSTRUCTIONS = (
+    "Widgets & Gizmos support agent. Be concise. "
+    "Refunds: unopened 30 days; defective 90 days; 5-7 business days to process."
+)
+SEED_PROMPT = "What is your refund policy?"
 
 
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
@@ -115,7 +105,7 @@
 ):
 
     created_agent = None
-    conversation_ids: List[str] = []
+    created_conversation_id: Optional[str] = None
     submitted_job_id: Optional[str] = None
     created_dataset: Optional[DatasetVersion] = None
 
@@ -139,24 +129,22 @@
         print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
 
         seed_start = datetime.now(tz=timezone.utc)
-        print(f"Seed {NUM_CONVERSATIONS} conversation(s) x {len(SEED_PROMPTS)} turn(s) against the agent.")
+        print(f"Seed one conversation against the agent (prompt: {SEED_PROMPT!r}).")
         with project_client.get_openai_client() as openai_client:
-            for ci in range(1, NUM_CONVERSATIONS + 1):
-                conversation = openai_client.conversations.create()
-                conversation_ids.append(conversation.id)
-                print(f"  - conversation {ci}/{NUM_CONVERSATIONS} (id: {conversation.id})")
-                for prompt in SEED_PROMPTS:
-                    openai_client.responses.create(
-                        conversation=conversation.id,
-                        input=prompt,
-                        extra_body={
-                            "agent_reference": {
-                                "name": created_agent.name,
-                                "id": created_agent.id,
-                                "type": "agent_reference",
-                            }
-                        },
-                    )
+            conversation = openai_client.conversations.create()
+            created_conversation_id = conversation.id
+            print(f"  - conversation id: {conversation.id}")
+            openai_client.responses.create(
+                conversation=conversation.id,
+                input=SEED_PROMPT,
+                extra_body={
+                    "agent_reference": {
+                        "name": created_agent.name,
+                        "id": created_agent.id,
+                        "type": "agent_reference",
+                    }
+                },
+            )
 
         print(
             f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the emitted spans. "
@@ -187,7 +175,8 @@
                             end_time=end_time,
                         ),
                     ],
-                    # Service requires max_samples in [15, 1000].
+                    # Service requires max_samples in [15, 1000]. It's a cap on
+                    # generated samples - one seeded trace turn is enough.
                     options=TracesDataGenerationJobOptions(max_samples=15),
                     output_options=DataGenerationJobOutputOptions(name=output_dataset_name),
                 ),
@@ -245,17 +234,13 @@
             except Exception as exc:  # pylint: disable=broad-exception-caught
                 print(f"  (warning) could not delete job: {exc}")
 
-        if conversation_ids:
+        if created_conversation_id is not None:
             try:
                 with project_client.get_openai_client() as openai_client:
-                    for cid in conversation_ids:
-                        try:
-                            openai_client.conversations.delete(conversation_id=cid)
-                            print(f"Deleted seeded conversation `{cid}`.")
-                        except Exception as exc:  # pylint: disable=broad-exception-caught
-                            print(f"  (warning) could not delete conversation `{cid}`: {exc}")
+                    openai_client.conversations.delete(conversation_id=created_conversation_id)
+                    print(f"Deleted seeded conversation `{created_conversation_id}`.")
             except Exception as exc:  # pylint: disable=broad-exception-caught
-                print(f"  (warning) could not open OpenAI client for conversation cleanup: {exc}")
+                print(f"  (warning) could not delete conversation: {exc}")
 
         if created_agent is not None:
             try:

From 416fdd8cb3a881f1544b15358ba856202928ddb7 Mon Sep 17 00:00:00 2001
From: aprilkim <aprilk@microsoft.com>
Date: Sun, 31 May 2026 10:40:29 -0700
Subject: [PATCH 8/8] Drop client instrumentation, 'temporary' wording,
 optional env vars

Prompt agents emit server-side traces to the project's connected App Insights, so client-side AIProjectInstrumentor + configure_azure_monitor are not required. Hardcode poll/wait constants and dataset name (still uniqueified via run id). Verified live: PASS in 231s, 1 sample generated, clean teardown.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 ...et_generation_job_traces_for_evaluation.py | 77 ++++++-------------
 1 file changed, 24 insertions(+), 53 deletions(-)

diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
index b61f37577f8c..498a2e9bdca2 100644
--- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
+++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_traces_for_evaluation.py
@@ -9,15 +9,16 @@
     Generates an evaluation dataset from an agent's conversation traces.
     The sample is fully self-contained:
 
-      1. Wires up Azure Monitor + AIProjectInstrumentor so agent calls emit
-         semantic GenAI spans (with content) to Application Insights.
-      2. Creates a temporary agent and seeds spans with sample conversations.
-      3. Waits for ingestion, then submits a `DataGenerationJob`
+      1. Creates an agent and seeds spans with a sample conversation.
+      2. Waits for ingestion, then submits a `DataGenerationJob`
          (scenario=EVALUATION, source=traces) that synthesizes Q/A pairs.
-      4. Polls the job and fetches the resulting `DatasetVersion`.
-      5. Cleans up the dataset, job, seeded conversations, and agent.
+      3. Polls the job and fetches the resulting `DatasetVersion`.
+      4. Cleans up the dataset, job, seeded conversation, and agent.
 
-    To adapt for an existing agent with recent traces, replace step 2 with
+    Prerequisite: the project must have an Application Insights resource
+    connected so the agent emits server-side traces.
+
+    To adapt for an existing agent with recent traces, replace step 1 with
     your agent's name and skip the ingestion wait.
 
 USAGE:
@@ -25,21 +26,13 @@
 
     Before running the sample:
 
-    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv \\
-        azure-monitor-opentelemetry azure-core-tracing-opentelemetry
+    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
 
     Set these environment variables with your own values:
     1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
        found in the overview page of your Microsoft Foundry project.
     2) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name used
-       to drive the temporary agent during trace seeding.
-    3) DATASET_NAME - Optional. Output dataset name. Defaults to
-       `traces-eval-sample`. Service caps the rendered name at 50 chars
-       (the sample appends a unique run-id suffix).
-    4) POLL_INTERVAL_SECONDS - Optional. Sleep between job status polls.
-       Defaults to 10.
-    5) TRACE_INGESTION_WAIT_SECONDS - Optional. Wait after seeding for
-       Application Insights ingestion. Defaults to 180.
+       to drive the agent during trace seeding.
 """
 
 import os
@@ -64,8 +57,6 @@
     TracesDataGenerationJobOptions,
     TracesDataGenerationJobSource,
 )
-from azure.ai.projects.telemetry import AIProjectInstrumentor
-from azure.monitor.opentelemetry import configure_azure_monitor
 
 load_dotenv()
 
@@ -81,20 +72,14 @@
 
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
 model_deployment = os.environ["FOUNDRY_MODEL_NAME"]
-dataset_name = os.environ.get("DATASET_NAME", "traces-eval-sample")
-poll_interval_seconds = int(os.environ.get("POLL_INTERVAL_SECONDS", "10"))
-trace_ingestion_wait_seconds = int(os.environ.get("TRACE_INGESTION_WAIT_SECONDS", "180"))
+DATASET_NAME = "traces-eval-sample"
+POLL_INTERVAL_SECONDS = 10
+TRACE_INGESTION_WAIT_SECONDS = 180
 
 # Per-run id keeps repeated runs from colliding; output names are capped at 50 chars.
 run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
-output_dataset_name = f"{dataset_name}-{run_id}"
-if len(output_dataset_name) > 50:
-    raise ValueError(
-        f"Output dataset name `{output_dataset_name}` exceeds the 50-character service limit. "
-        f"Shorten DATASET_NAME (currently `{dataset_name}`) so that `<name>-<run id>` fits within 50 characters."
-    )
-
-agent_name = f"traces-eval-sample-{run_id}"
+output_dataset_name = f"{DATASET_NAME}-{run_id}"
+agent_name = f"{DATASET_NAME}-{run_id}"
 
 TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED}
 
@@ -110,18 +95,8 @@
     created_dataset: Optional[DatasetVersion] = None
 
     try:
-        # 1. Configure Azure Monitor + GenAI instrumentation to emit spans with content.
-        # AIProjectInstrumentor gates on this env var at instrument() time; without it
-        # instrument() returns early and no spans flow.
-        os.environ["AZURE_EXPERIMENTAL_ENABLE_GENAI_TRACING"] = "true"
-
-        print("Configure Azure Monitor exporter from the project's Application Insights connection.")
-        connection_string = project_client.telemetry.get_application_insights_connection_string()
-        configure_azure_monitor(connection_string=connection_string)
-        AIProjectInstrumentor().instrument(enable_content_recording=True)
-
-        # 2. Create a temporary agent and seed traces.
-        print(f"Create temporary agent `{agent_name}` (model: `{model_deployment}`).")
+        # 1. Create an agent and seed traces.
+        print(f"Create agent `{agent_name}` (model: `{model_deployment}`).")
         created_agent = project_client.agents.create_version(
             agent_name=agent_name,
             definition=PromptAgentDefinition(model=model_deployment, instructions=AGENT_INSTRUCTIONS),
@@ -146,14 +121,10 @@
                 },
             )
 
-        print(
-            f"Wait {trace_ingestion_wait_seconds}s for Application Insights to ingest the emitted spans. "
-            f"Override with TRACE_INGESTION_WAIT_SECONDS.",
-            flush=True,
-        )
-        time.sleep(trace_ingestion_wait_seconds)
+        print(f"Wait {TRACE_INGESTION_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True)
+        time.sleep(TRACE_INGESTION_WAIT_SECONDS)
 
-        # 3. Submit a data generation job that reads the agent's traces.
+        # 2. Submit a data generation job that reads the agent's traces.
         # Small backoff so the seeded spans fall inside the queried window.
         start_time = seed_start - timedelta(minutes=5)
         end_time = datetime.now(tz=timezone.utc)
@@ -169,7 +140,7 @@
                     scenario=DataGenerationJobScenario.EVALUATION,
                     sources=[
                         TracesDataGenerationJobSource(
-                            description="Application Insights conversation traces for the temporary agent.",
+                            description="Application Insights conversation traces for the agent.",
                             agent_name=agent_name,
                             start_time=start_time,
                             end_time=end_time,
@@ -187,7 +158,7 @@
 
         print(f"Poll job `{job.id}` until it reaches a terminal state.", end="", flush=True)
         while job.status not in TERMINAL_STATUSES:
-            time.sleep(poll_interval_seconds)
+            time.sleep(POLL_INTERVAL_SECONDS)
             print(".", end="", flush=True)
             job = project_client.beta.datasets.get_generation_job(job_id=job.id)
         print()
@@ -197,7 +168,7 @@
             message = job.error.message if job.error is not None else "<no error message>"
             raise RuntimeError(f"Job `{job.id}` ended with status `{job.status}`: {message}")
 
-        # 4. Resolve the generated dataset.
+        # 3. Resolve the generated dataset.
         outputs = (job.result.outputs if job.result is not None else None) or []
         dataset_output = next(
             (o for o in outputs if isinstance(o, DatasetDataGenerationJobOutput)), None
@@ -248,6 +219,6 @@
                     agent_name=created_agent.name,
                     agent_version=created_agent.version,
                 )
-                print(f"Deleted temporary agent `{created_agent.name}` v{created_agent.version}.")
+                print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.")
             except Exception as exc:  # pylint: disable=broad-exception-caught
                 print(f"  (warning) could not delete agent: {exc}")