From ac855eb584b5fe5e7e4335c233a057304c1d0ffd Mon Sep 17 00:00:00 2001 From: aprilkim Date: Sat, 30 May 2026 19:18:50 -0700 Subject: [PATCH] [ai/azure-ai-projects] fix sample print crashes on cp1252; expand finetuning seed; make simulation dataset always-fresh Three small fixes uncovered while bug-bashing the 2.2.0 samples against a live Foundry project: 1. Unicode print crashes on Windows cp1252 (16 sample files, 36 print lines). Final summary prints used U+2713/U+2717/U+2705/U+274C glyphs which crash the script when stdout is the default Windows code page (cp1252) and the user has not exported PYTHONIOENCODING=utf-8. Replace the glyphs with ASCII [OK] / [FAIL] tokens so the samples succeed out of the box on a fresh Windows shell. 2. Finetuning seed document was too small for SUPERVISED_FINETUNING QnA generation. The 1.2 KB embedded reference doc passed the eval scenario but the finetuning scenario rejected it with "File content lacks sufficient context to generate quality questions." Expanded the seed to a ~10 KB widgets/gizmos/sprockets reference (same domain, much richer prose) which lets the service synthesize the requested 15 QnA pairs. Also updated the surrounding comment to explain the size requirement. 3. Simulation sample silently fell back to a stale cached dataset on re-runs. The previous code uploaded simulation-scenarios:v1 and, on the inevitable "already exists" failure for the second run, swallowed the exception and pointed the run at the cached server-side dataset. That made the sample non-reproducible: anyone editing the local JSONL would not see their changes. Now the dataset name is suffixed with a per-run id so every invocation uploads fresh data, and the "expected conversation count" line is computed from the actual JSONL row count rather than a hard-coded constant. Verified locally: all modified files compile and a live end-to-end run of sample_multiturn_conversation_evaluation.py prints "[OK] Evaluation run completed successfully!" 
cleanly on a default Windows shell. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...generation_job_simpleqna_for_finetuning.py | 246 ++++++++++++++++-- .../evaluations/sample_agent_evaluation.py | 4 +- .../sample_agent_response_evaluation.py | 4 +- ..._response_evaluation_with_function_tool.py | 4 +- ...ple_agent_trace_evaluation_smart_filter.py | 4 +- .../sample_evaluation_cluster_insight.py | 8 +- .../sample_evaluation_compare_insight.py | 6 +- .../evaluations/sample_model_evaluation.py | 4 +- .../sample_model_evaluation_instant_model.py | 4 +- ...ample_multiturn_conversation_evaluation.py | 4 +- ...ample_multiturn_conversation_simulation.py | 44 ++-- ...multiturn_trace_evaluation_agent_filter.py | 4 +- ...sample_multiturn_trace_evaluation_by_id.py | 4 +- ...ed_agent_traces_evaluation_smart_filter.py | 4 +- .../sample_scheduled_evaluations.py | 4 +- .../sample_synthetic_data_agent_evaluation.py | 4 +- .../sample_synthetic_data_model_evaluation.py | 4 +- 17 files changed, 278 insertions(+), 78 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_simpleqna_for_finetuning.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_simpleqna_for_finetuning.py index 8a545c3dd6b3..5aaca24250c7 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_simpleqna_for_finetuning.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_dataset_generation_job_simpleqna_for_finetuning.py @@ -83,32 +83,226 @@ f"Lower DATASET_NAME (currently `{dataset_name}`) so that `-` fits within 50 characters." ) -# Reference document the sample uploads as an Azure OpenAI file. The service -# requires the file to contain at least 1 KB of content to generate QnA from. -SEED_REFERENCE_DOCUMENT = """# Widgets and Gizmos Reference - -## Products -- Widget: blue, manufactured at Factory 7 in Acme, carbon-fiber, rated to 80 C, sold in packs of 4, 250 g each. 
-- Gizmo: red, manufactured at Factory 12 in Bedrock, carbon-fiber, rated to 80 C, sold individually, 1.2 kg each. -- Sprocket: green, manufactured at Factory 3 in Acme, stainless steel, rated to 200 C, sold individually, 500 g each. - -## Operations -- Factory operates weekdays 0700-1900 local time. -- Closed on public holidays, except for the annual maintenance run on December 27. -- ISO 9001 certified; audited annually by an independent third party. -- Quality control samples every 100th unit and runs full destructive testing on every 5000th unit. - -## Customer support -- Warranty claims: email support@example.com with the serial number printed on the underside of the product. -- Returns: accepted within 30 days if unopened; opened items are eligible for repair only. -- Bulk orders (50+ units): contact sales@example.com for volume pricing and an extended 90-day return window. -- Replacement parts: orderable directly from the support portal using the original order number. - -## Pricing and SLAs -- Widget pack: USD 24.99 per 4-pack; free shipping on orders over USD 75. -- Gizmo unit: USD 49.99; free shipping on orders over USD 75. -- Sprocket unit: USD 14.99; ships from regional warehouses in 1-2 business days. -- Standard support response: within one business day. Priority support response: within four hours. +# Reference document the sample uploads as an Azure OpenAI file. SUPERVISED_FINETUNING +# QnA generation requires a substantially richer corpus than the eval scenario does; +# a 1-2 KB summary is not enough and the service will reject it with +# "File content lacks sufficient context to generate quality questions." Keep this +# block at roughly 8-12 KB of varied prose so the service has enough material to +# synthesize diverse question/answer pairs. +SEED_REFERENCE_DOCUMENT = """# Widgets, Gizmos, and Sprockets: Complete Product and Operations Reference + +## 1. 
Product Catalog + +### 1.1 Widget (model WDG-100) +The Widget is a structural carbon-fiber component manufactured at Factory 7 in Acme. It is +finished in matte blue (Pantone 2935 C) using a UV-stable powder coating. Each unit measures +120 mm x 40 mm x 18 mm and weighs 250 g (+/- 5 g). Widgets are rated for continuous service +up to 80 degrees Celsius and a transient peak of 95 degrees Celsius for up to 60 seconds. +Widgets ship in packs of 4, packaged in recyclable cardboard with biodegradable foam inserts. +The serial number is laser-etched on the underside in the format WDG-100-YYWW-NNNNN, where +YY is the two-digit year, WW is the ISO week, and NNNNN is the per-week sequence number. + +Compatible mounting hardware: M5 stainless steel bolts, torqued to 6.0 Nm. Substituting +non-stainless bolts voids the corrosion portion of the warranty. + +### 1.2 Gizmo (model GZM-200) +The Gizmo is a precision carbon-fiber assembly manufactured at Factory 12 in Bedrock. It is +finished in gloss red (Pantone 186 C). Each Gizmo measures 220 mm x 110 mm x 60 mm and +weighs 1.2 kg (+/- 20 g). Gizmos are sold individually and are rated to 80 degrees Celsius. +They include an integrated thermal cutoff that disables the unit at 88 degrees Celsius and +re-enables it after a five minute cool-down. The serial number is engraved on the side and +follows the format GZM-200-YYWW-NNNNN. + +Compatible mounting hardware: M8 stainless steel bolts, torqued to 18 Nm. Gizmos should be +installed on a flat surface with no more than 0.5 mm of warp across the 220 mm dimension. + +### 1.3 Sprocket (model SPR-300) +The Sprocket is a stainless-steel rotating component manufactured at Factory 3 in Acme. +It is finished in anodised green and weighs 500 g. The Sprocket is rated for continuous +service up to 200 degrees Celsius. The teeth count is 24, the pitch diameter is 60 mm, +and the bore is 12 mm with a standard 4 mm keyway. 
Sprockets ship individually with a +laser-etched serial number on the hub in the format SPR-300-YYWW-NNNNN. + +Compatible mounting hardware: M12 stainless steel set screws, torqued to 22 Nm. + +### 1.4 Compatibility matrix +* Widget + Gizmo: fully compatible, no adapter required. +* Widget + Sprocket: requires the WDG-SPR adapter plate (part WDG-SPR-A01). +* Gizmo + Sprocket: requires the GZM-SPR adapter plate (part GZM-SPR-A02) and a 4 mm shim. +* Widget + Gizmo + Sprocket (three-way stack): requires both adapter plates and the + triple-stack bracket WGS-T01. Torque all bolts to spec in the sequence Widget, + Gizmo, Sprocket. + +## 2. Manufacturing and Operations + +### 2.1 Factory schedule +All three factories operate weekdays from 0700 to 1900 local time. Factories are closed +on national public holidays except for the annual maintenance run on December 27, when +each factory performs cleaning, lubrication, and recalibration of CNC equipment and +finishing lines. The maintenance run runs from 0600 to 1400 local time and does not +produce shippable inventory. + +### 2.2 Quality control +Every factory is ISO 9001:2015 certified and is audited annually by an independent +third party. Quality control samples every 100th unit for visual and dimensional +inspection. Every 5000th unit undergoes full destructive testing including tensile, +compressive, and thermal cycling. Destructive test results are archived for seven +years and are available to enterprise customers on request. + +### 2.3 Lot traceability +The leading model code of every serial number (for example WDG-100) identifies the +model, the four-digit YYWW group identifies the two-digit year and ISO week, and the +final five digits (NNNNN) identify the per-week sequence number. Given any serial number, +customer support can identify the production line, the shift, the operator, and the raw +material lot that produced the unit. Lot traceability records are retained for the life +of the product plus three years. 
+ +### 2.4 Environmental +All three factories are powered by a mix of on-site solar and grid-tied wind generation. +Total Scope 1 and Scope 2 emissions for FY2025 were 12,400 tonnes CO2e, a 14 percent +reduction from FY2024. Packaging is fully recyclable; the cardboard boxes are made from +80 percent post-consumer recycled fibre and the biodegradable foam is corn-starch based. + +## 3. Pricing and Ordering + +### 3.1 Standard list prices +* Widget 4-pack (WDG-100-PK4): USD 24.99 +* Gizmo single (GZM-200): USD 49.99 +* Sprocket single (SPR-300): USD 14.99 +* WDG-SPR adapter plate: USD 6.50 +* GZM-SPR adapter plate: USD 7.50 +* Triple-stack bracket WGS-T01: USD 18.00 + +### 3.2 Shipping +Free standard shipping is provided on orders over USD 75 within the United States and +Canada. International orders incur shipping based on weight and destination, computed +at checkout. Standard transit time within North America is 3 to 5 business days. Express +overnight shipping is available for an additional USD 18 per shipment. + +### 3.3 Bulk orders +Bulk orders of 50 or more units of any single product receive a 12 percent discount +on the list price plus a 90 day return window. Bulk orders of 250 or more units +receive an 18 percent discount and the option of a dedicated account manager. Contact +sales@example.com for bulk orders. + +### 3.4 Payment terms +Standard payment is due at checkout via credit card or PayPal. Enterprise customers +with an approved purchase order may pay net 30 days from invoice date. Late payments +incur a 1.5 percent monthly service charge. + +## 4. Warranty and Returns + +### 4.1 Standard warranty +All products carry a two year limited warranty against defects in materials and +workmanship from the date of purchase. The warranty does not cover damage from +incorrect installation, exposure beyond the rated temperature range, modification, +or normal wear. 
Warranty service is provided by repair, replacement, or refund at +the manufacturer's discretion. + +### 4.2 Filing a warranty claim +Warranty claims are filed by emailing support@example.com with the product serial +number, a description of the issue, and photographs of the failure mode. Acme will +respond within one business day with either a Return Merchandise Authorisation (RMA) +number or a request for additional information. RMAs are valid for 30 days and must +be referenced on the outside of any returned package. + +### 4.3 Returns +Unopened products can be returned within 30 days of receipt for a full refund. +Opened products are eligible for repair only, except where required by local law. +Bulk orders (50+ units) are eligible for return within 90 days under the same +unopened/opened rules. Custom-finished products are non-returnable. + +### 4.4 Repair turnaround +The target turnaround for in-warranty repair is 10 business days from receipt at the +service centre. Out-of-warranty repair is offered at a fixed rate of USD 35 per +Widget, USD 60 per Gizmo, or USD 20 per Sprocket, plus return shipping. + +## 5. Installation and Use + +### 5.1 Pre-installation checks +Before installing any product, inspect for transit damage. If the box shows signs of +crushing or moisture, photograph the damage before opening and report it to +support@example.com within 48 hours. Confirm that the serial number on the unit +matches the packing slip. + +### 5.2 Widget installation +Mount Widgets on a flat surface with M5 stainless steel bolts torqued to 6.0 Nm in +a star pattern. Apply a thin film of anti-seize compound to the bolt threads. Allow +the assembly to cure for 30 minutes before applying load. + +### 5.3 Gizmo installation +Mount Gizmos on a flat surface with M8 stainless steel bolts torqued to 18 Nm. +Do not exceed 22 Nm; over-torquing can crack the carbon-fiber housing. 
The thermal +cutoff cable must be routed away from heat sources and secured with the supplied +P-clips at intervals of no more than 200 mm. + +### 5.4 Sprocket installation +Press the Sprocket onto a 12 mm shaft using an arbor press. Hand pressure or +percussive installation will damage the bore tolerance. Once seated, install the +M12 set screw in the keyway and torque to 22 Nm. + +### 5.5 Periodic maintenance +Inspect mounting hardware every 6 months. Re-torque to spec if any fastener has +loosened. Replace any fastener that shows corrosion or thread damage. Clean exterior +surfaces with isopropyl alcohol and a microfiber cloth; do not use abrasive cleaners. + +## 6. Customer Support + +### 6.1 Contact channels +* Email: support@example.com (response within one business day) +* Priority email: priority@example.com (response within four hours for enterprise + customers with a current support agreement) +* Phone: 1-800-555-0100, Monday to Friday, 0800 to 1800 Eastern Time +* Self-service portal: https://support.example.com + +### 6.2 Service level agreements +The standard SLA is a one business day first response for general inquiries and a +four hour first response for priority inquiries. Critical production-down issues for +enterprise customers receive a one hour first response and a continuous-effort +resolution target until the issue is resolved. + +### 6.3 Replacement parts +Replacement parts including bolts, adapter plates, P-clips, and thermal cutoff +cables can be ordered directly from the support portal using the original order +number. Common parts ship the same business day if ordered before 1500 Eastern Time. + +## 7. Frequently Asked Questions + +Q. What is the maximum operating temperature of a Widget? +A. 80 degrees Celsius continuous, with a transient peak of 95 degrees Celsius for +up to 60 seconds. + +Q. Can I install a Gizmo with non-stainless bolts? +A. No. Using non-stainless bolts voids the corrosion portion of the warranty. + +Q. 
Does the Sprocket fit a 12 mm shaft? +A. Yes. The Sprocket bore is 12 mm with a standard 4 mm keyway. + +Q. What is the lead time for bulk orders of 250 units? +A. Standard lead time is 10 to 15 business days, plus shipping. + +Q. How do I know when my Gizmo's thermal cutoff has tripped? +A. The unit will go silent and the status LED will blink red twice per second. +After five minutes the unit will automatically re-enable and resume normal operation. + +Q. Where do I find the serial number? +A. Widget: laser-etched on the underside. Gizmo: engraved on the side. Sprocket: +laser-etched on the hub. + +Q. Are your products RoHS compliant? +A. Yes. All three products comply with EU RoHS 2 (Directive 2011/65/EU) and RoHS 3 +(Directive 2015/863). + +Q. Do you offer custom colours? +A. Custom finishes are available for orders of 500 or more units. Contact +sales@example.com for a custom-finish quote. Custom-finished products are +non-returnable. + +Q. What torque should I use for the M8 bolts on a Gizmo? +A. 18 Nm. Do not exceed 22 Nm. + +Q. How long is the warranty? +A. Two years from date of purchase against defects in materials and workmanship. 
""" TERMINAL_STATUSES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.CANCELLED} diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.py index 5a930105fb95..113bc8348e79 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.py @@ -132,7 +132,7 @@ time.sleep(5) if agent_eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {agent_eval_run.result_counts}") output_items = list( @@ -143,7 +143,7 @@ pprint(output_items) print(f"{'-'*60}") else: - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.py index 2e60f43b3430..4bab6dfb8623 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.py @@ -113,7 +113,7 @@ time.sleep(5) if response_eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {response_eval_run.result_counts}") output_items = list( @@ -126,7 +126,7 @@ pprint(output_items) print(f"{'-'*60}") else: - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation_with_function_tool.py 
b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation_with_function_tool.py index 03d33aa6dd40..2e1eff7feb8b 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation_with_function_tool.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation_with_function_tool.py @@ -167,7 +167,7 @@ def get_horoscope(sign: str) -> str: time.sleep(5) if response_eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {response_eval_run.result_counts}") output_items = list( @@ -181,7 +181,7 @@ def get_horoscope(sign: str) -> str: print(f"{'-'*60}") else: print(f"Eval Run Report URL: {response_eval_run.report_url}") - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py index bb1e44c4898a..9286b214e5f0 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py @@ -159,7 +159,7 @@ time.sleep(5) if run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {run.result_counts}") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) @@ -170,7 +170,7 @@ print(f"\nEval Run Report URL: {run.report_url}") else: - print(f"\n✗ Evaluation run failed: {run.error}") + print(f"\n[FAIL] Evaluation run failed: {run.error}") client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") \ No newline at end of file diff 
--git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py index a125dd62ed04..d4613060054f 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py @@ -119,7 +119,7 @@ # If the eval run completed successfully, generate cluster insights if eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Evaluation run result counts: {eval_run.result_counts}") clusterInsight = project_client.beta.insights.generate( @@ -141,13 +141,13 @@ time.sleep(5) if clusterInsight.state == OperationState.SUCCEEDED: - print("\n✓ Cluster insights generated successfully!") + print("\n[OK] Cluster insights generated successfully!") pprint(clusterInsight) else: - print("\n✗ Cluster insight generation failed.") + print("\n[FAIL] Cluster insight generation failed.") else: - print("\n✗ Evaluation run failed. Cannot generate cluster insights.") + print("\n[FAIL] Evaluation run failed. 
Cannot generate cluster insights.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_compare_insight.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_compare_insight.py index 0b48752f4a90..d7bc4ab6e1d0 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_compare_insight.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_compare_insight.py @@ -131,7 +131,7 @@ failed_runs = [run for run in completed_runs.values() if run.status == "failed"] if not failed_runs: - print("\n✓ Both evaluation runs completed successfully!") + print("\n[OK] Both evaluation runs completed successfully!") # Generate comparison insights compareInsight = project_client.beta.insights.generate( @@ -150,11 +150,11 @@ time.sleep(5) if compareInsight.state == OperationState.SUCCEEDED: - print("\n✓ Evaluation comparison generated successfully!") + print("\n[OK] Evaluation comparison generated successfully!") pprint(compareInsight) else: - print("\n✗ One or more eval runs failed. Cannot generate comparison insight.") + print("\n[FAIL] One or more eval runs failed. 
Cannot generate comparison insight.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation.py index 82b18c6f045d..e03dfd32a381 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation.py @@ -112,7 +112,7 @@ time.sleep(5) if agent_eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {agent_eval_run.result_counts}") output_items = list( @@ -123,7 +123,7 @@ pprint(output_items) print(f"{'-'*60}") else: - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation_instant_model.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation_instant_model.py index 1517b05eab62..10280bc82927 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation_instant_model.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation_instant_model.py @@ -112,7 +112,7 @@ time.sleep(5) if agent_eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {agent_eval_run.result_counts}") output_items = list( @@ -123,7 +123,7 @@ pprint(output_items) print(f"{'-'*60}") else: - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_evaluation.py 
b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_evaluation.py index 375478f59768..4dc36168008c 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_evaluation.py @@ -159,7 +159,7 @@ time.sleep(5) if run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {run.result_counts}") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) @@ -170,7 +170,7 @@ print(f"\nEval Run Report URL: {run.report_url}") else: - print(f"\n✗ Evaluation run failed: {run.error}") + print(f"\n[FAIL] Evaluation run failed: {run.error}") client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_simulation.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_simulation.py index 6dd7db32cd9f..76f7a7a8f01b 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_simulation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_conversation_simulation.py @@ -20,7 +20,7 @@ Key concepts: - data_source type is "azure_ai_target_completions" with item_generation_params.type = "conversation_gen_preview" - - num_conversations is per seed scenario (e.g., 2 conversations × 3 scenarios = 6 total) + - num_conversations is per seed scenario (e.g., 2 conversations per scenario) - max_turns controls the maximum exchanges per conversation - The seed scenarios source is at the data_source root level @@ -40,6 +40,8 @@ import os import time +import uuid +from datetime import datetime, timezone from pprint import pprint from dotenv import load_dotenv from openai.types.eval_create_params import DataSourceConfigCustom @@ -58,6 +60,10 @@ 
data_folder = os.environ.get("DATA_FOLDER", os.path.join(script_dir, "data_folder")) scenarios_file = os.path.join(data_folder, "sample_data_simulation_scenarios.jsonl") +# Tag every run with a unique id so each invocation uploads a fresh dataset rather +# than silently re-using a stale cached version on the service. +run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" + with ( DefaultAzureCredential() as credential, AIProjectClient(endpoint=endpoint, credential=credential) as project_client, @@ -127,20 +133,18 @@ ) print(f"Evaluation created (id: {eval_object.id})") - # Upload the simulation scenarios dataset - try: - dataset = project_client.datasets.upload_file( - name="simulation-scenarios", - version="1", - file_path=scenarios_file, - ) - assert dataset.id is not None, "Dataset upload returned no ID" - scenarios_id: str = dataset.id - print(f"Scenarios dataset uploaded (id: {scenarios_id})") - except Exception: - # Dataset already exists — use the existing URI - scenarios_id = f"azureai://accounts/{endpoint.split('/')[2].split('.')[0]}/projects/{endpoint.rstrip('/').split('/')[-1]}/data/simulation-scenarios/versions/1" - print(f"Using existing scenarios dataset (id: {scenarios_id})") + # Upload the simulation scenarios dataset. The name is suffixed with `run_id` so + # every invocation creates a fresh dataset on the service; without this the + # service would reject a re-upload of a same-named dataset and the sample would + # silently fall back to whatever stale version was last cached. 
+ dataset = project_client.datasets.upload_file( + name=f"simulation-scenarios-{run_id}", + version="1", + file_path=scenarios_file, + ) + assert dataset.id is not None, "Dataset upload returned no ID" + scenarios_id: str = dataset.id + print(f"Scenarios dataset uploaded (id: {scenarios_id})") # Create a simulation run # - source: the seed scenarios dataset (each row is a test case) @@ -193,10 +197,12 @@ time.sleep(10) if run.status == "completed": - print("\n✓ Simulation run completed successfully!") + print("\n[OK] Simulation run completed successfully!") print(f"Result Counts: {run.result_counts}") - # With 3 seed scenarios and num_conversations=2, expect 6 total conversations - print(f"Expected: {3 * 2} conversations (3 scenarios × 2 per scenario)") + # Total conversations = (rows in scenarios_file) * num_conversations + with open(scenarios_file, encoding="utf-8") as f: + num_scenarios = sum(1 for line in f if line.strip()) + print(f"Expected: {num_scenarios * 2} conversations ({num_scenarios} scenarios x 2 per scenario)") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) print(f"\nOUTPUT ITEMS (Total: {len(output_items)})") @@ -206,7 +212,7 @@ print(f"\nEval Run Report URL: {run.report_url}") else: - print(f"\n✗ Simulation run failed: {run.error}") + print(f"\n[FAIL] Simulation run failed: {run.error}") client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py index 7a65c9f6b4ba..d3a00ac1d34f 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py @@ -162,7 +162,7 @@ def main() -> None: time.sleep(5) if run.status == "completed": - print("\n✓ 
Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {run.result_counts}") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) @@ -173,7 +173,7 @@ def main() -> None: print(f"\nEval Run Report URL: {run.report_url}") else: - print(f"\n✗ Evaluation run failed: {run.error}") + print(f"\n[FAIL] Evaluation run failed: {run.error}") client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py index f8117be7ae3b..e50bf5efde07 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py @@ -149,7 +149,7 @@ time.sleep(5) if run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {run.result_counts}") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) @@ -160,7 +160,7 @@ print(f"\nEval Run Report URL: {run.report_url}") else: - print(f"\n✗ Evaluation run failed: {run.error}") + print(f"\n[FAIL] Evaluation run failed: {run.error}") client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_agent_traces_evaluation_smart_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_agent_traces_evaluation_smart_filter.py index 4e39ea2eb539..a41a69c78174 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_agent_traces_evaluation_smart_filter.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_agent_traces_evaluation_smart_filter.py @@ -176,7 +176,7 @@ 
def assign_rbac(): # pylint: disable=too-many-statements raise elif "RoleAssignmentExists" in error_message: - print("\n✅ ROLE ASSIGNMENT ALREADY EXISTS:") + print("\n[OK] ROLE ASSIGNMENT ALREADY EXISTS:") print("The 'Foundry User' role is already assigned to the project's managed identity.") print("No action needed - the required permissions are already in place.") @@ -194,7 +194,7 @@ def assign_rbac(): # pylint: disable=too-many-statements print("This usually indicates a service availability issue.") else: - print("\n❌ UNEXPECTED ERROR:") + print("\n[FAIL] UNEXPECTED ERROR:") print("An unexpected error occurred. Please check the error details above.") raise diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_evaluations.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_evaluations.py index 7155d540b640..6bd9a6d02505 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_evaluations.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_evaluations.py @@ -192,7 +192,7 @@ def assign_rbac(): # pylint: disable=too-many-statements raise elif "RoleAssignmentExists" in error_message: - print("\n✅ ROLE ASSIGNMENT ALREADY EXISTS:") + print("\n[OK] ROLE ASSIGNMENT ALREADY EXISTS:") print("The 'Azure AI User' role is already assigned to the project's managed identity.") print("No action needed - the required permissions are already in place.") @@ -210,7 +210,7 @@ def assign_rbac(): # pylint: disable=too-many-statements print("This usually indicates a service availability issue.") else: - print("\n❌ UNEXPECTED ERROR:") + print("\n[FAIL] UNEXPECTED ERROR:") print("An unexpected error occurred. 
Please check the error details above.") raise diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_agent_evaluation.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_agent_evaluation.py index dada3b8e2418..d390ea8b5b90 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_agent_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_agent_evaluation.py @@ -147,7 +147,7 @@ time.sleep(5) if eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {eval_run.result_counts}") output_items = list(client.evals.runs.output_items.list(run_id=eval_run.id, eval_id=eval_object.id)) @@ -164,7 +164,7 @@ if output_dataset_id: print(f"Output Dataset ID (for reuse): {output_dataset_id}") else: - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_model_evaluation.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_model_evaluation.py index f155654b6a86..18dec83017ac 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_model_evaluation.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_synthetic_data_model_evaluation.py @@ -151,7 +151,7 @@ time.sleep(5) if eval_run.status == "completed": - print("\n✓ Evaluation run completed successfully!") + print("\n[OK] Evaluation run completed successfully!") print(f"Result Counts: {eval_run.result_counts}") output_items = list(client.evals.runs.output_items.list(run_id=eval_run.id, eval_id=eval_object.id)) @@ -168,7 +168,7 @@ if output_dataset_id: print(f"Output Dataset ID (for reuse): {output_dataset_id}") else: - print("\n✗ Evaluation run failed.") + print("\n[FAIL] Evaluation run failed.") 
client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted")