From 28c6e961646997154443ea726264c82292380909 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Tue, 12 May 2026 12:09:03 +0530 Subject: [PATCH 01/23] feat: Add LLM token usage tracking to Application Insights Implement comprehensive token usage tracking across all LLM call sites in ContentProcessor and ContentProcessorWorkflow, following the MACAE psl-token-usage branch pattern. Changes: - Add token_usage_utils.py with extract/emit helpers for both projects - Instrument MapHandler (ContentProcessor) with per-call token events - Instrument RAI, Summarize, GapAnalysis executors (Workflow) - Add summary and per-model token events in save_handler - Add 18 KQL queries for Azure Workbook visualization - Add unit tests (18 tests) for token_usage_utils Events emitted to Application Insights: - LLM_Agent_Token_Usage: per agent/step with model, process_id - LLM_Model_Token_Usage: per model deployment aggregation - LLM_Token_Usage_Summary: per document totals Ref: User Story #43251 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-queries.kql | 283 ++++++++++++++++++ .../src/libs/pipeline/handlers/map_handler.py | 10 + .../libs/pipeline/handlers/save_handler.py | 22 ++ .../src/libs/token_usage_utils.py | 231 ++++++++++++++ .../tests/unit/libs/test_token_usage_utils.py | 231 ++++++++++++++ .../src/libs/token_usage_utils.py | 231 ++++++++++++++ .../gap_analysis/executor/gap_executor.py | 12 + .../src/steps/rai/executor/rai_executor.py | 12 + .../summarize/executor/summarize_executor.py | 12 + 9 files changed, 1044 insertions(+) create mode 100644 infra/dashboards/token-usage-queries.kql create mode 100644 src/ContentProcessor/src/libs/token_usage_utils.py create mode 100644 src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py create mode 100644 src/ContentProcessorWorkflow/src/libs/token_usage_utils.py diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql new file mode 
100644 index 00000000..a08cd640 --- /dev/null +++ b/infra/dashboards/token-usage-queries.kql @@ -0,0 +1,283 @@ +// ============================================================ +// KQL Queries for LLM Token Usage Monitoring +// Content Processing Solution Accelerator +// Run these in Application Insights > Logs +// ============================================================ + +// 1. Overall token usage summary (last 7 days) +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(7d) +| extend input_tokens = toint(customDimensions['total_input_tokens']) +| extend output_tokens = toint(customDimensions['total_output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize + TotalDocuments = count(), + TotalInputTokens = sum(input_tokens), + TotalOutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + AvgTokensPerDocument = round(avg(total_tokens), 0) + +// 2. Token usage by pipeline step (agent) +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + Invocations = count() + by Step = agent +| order by TotalTokens desc + +// 3. Token usage over time (hourly) +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(7d) +| extend input_tokens = toint(customDimensions['total_input_tokens']) +| extend output_tokens = toint(customDimensions['total_output_tokens']) +| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h) +| order by timestamp asc +| render areachart + +// 4. 
Estimated cost (GPT-4o pricing: $2.50/1M input, $10.00/1M output) +let input_price_per_million = 2.50; +let output_price_per_million = 10.00; +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(30d) +| extend input_tokens = toint(customDimensions['total_input_tokens']) +| extend output_tokens = toint(customDimensions['total_output_tokens']) +| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d) +| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4) +| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4) +| extend TotalCost = InputCost + OutputCost +| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost +| order by Day desc + +// 5. Top token consumers by document +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(7d) +| extend total_tokens = toint(customDimensions['total_tokens']) +| extend process_id = tostring(customDimensions['process_id']) +| extend file_name = tostring(customDimensions['file_name']) +| summarize TotalTokens = sum(total_tokens) by process_id, file_name +| order by TotalTokens desc +| take 20 + +// 6. Pipeline step token distribution (pie chart) +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize TotalTokens = sum(total_tokens) by agent +| render piechart + +// 7. Token usage percentiles per document +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(7d) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize + p50 = percentile(total_tokens, 50), + p90 = percentile(total_tokens, 90), + p95 = percentile(total_tokens, 95), + p99 = percentile(total_tokens, 99), + Max = max(total_tokens) + +// 8. 
Token usage by step grouping (Extraction vs Analysis vs Safety) +let StepGroupMapping = datatable(agent:string, StepGroup:string) [ + "MapHandler", "Extraction", + "RAI", "Safety", + "Summarize", "Analysis", + "GapAnalysis", "Analysis" +]; +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| lookup kind=leftouter StepGroupMapping on agent +| extend StepGroup = iff(isempty(StepGroup), "Unknown", StepGroup) +| summarize + TotalRequests = count(), + TotalInputTokens = sum(input_tokens), + TotalOutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + AvgTokensPerRequest = round(avg(total_tokens), 0) + by StepGroup +| order by TotalTokens desc + +// 9. Token usage by model deployment +customEvents +| where name == 'LLM_Model_Token_Usage' +| where timestamp > ago(7d) +| extend model = tostring(customDimensions['model_deployment_name']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + Invocations = count() + by Model = model +| order by TotalTokens desc + +// 10. Token usage by model over time (hourly) +customEvents +| where name == 'LLM_Model_Token_Usage' +| where timestamp > ago(7d) +| extend model = tostring(customDimensions['model_deployment_name']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1h), model +| order by timestamp asc +| render areachart + +// 11. 
Model token distribution (pie chart) +customEvents +| where name == 'LLM_Model_Token_Usage' +| where timestamp > ago(7d) +| extend model = tostring(customDimensions['model_deployment_name']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize TotalTokens = sum(total_tokens) by model +| render piechart + +// 12. Estimated cost by model (adjust pricing per model) +let gpt4o_input = 2.50; +let gpt4o_output = 10.00; +let gpt4o_mini_input = 0.15; +let gpt4o_mini_output = 0.60; +customEvents +| where name == 'LLM_Model_Token_Usage' +| where timestamp > ago(30d) +| extend model = tostring(customDimensions['model_deployment_name']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model +| extend InputPrice = case( + model has "mini", gpt4o_mini_input, + gpt4o_input) +| extend OutputPrice = case( + model has "mini", gpt4o_mini_output, + gpt4o_output) +| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4) +| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4) +| extend TotalCost = InputCost + OutputCost +| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost +| order by TotalCost desc + +// 13. 
Step-to-model mapping with token usage +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| extend model = tostring(customDimensions['model_deployment_name']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + Invocations = count() + by Step = agent, Model = model +| order by TotalTokens desc + +// 14. RAI agent specific token usage +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| where agent == "RAI" +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| extend model = tostring(customDimensions['model_deployment_name']) +| summarize + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + Invocations = count() + by Model = model + +// 15. 
OpenTelemetry auto-instrumented OpenAI calls (if available) +dependencies +| where name has "openai" or target has "openai" +| where timestamp > ago(7d) +| extend input_tokens = tolong(customDimensions["gen_ai.usage.input_tokens"]) +| extend output_tokens = tolong(customDimensions["gen_ai.usage.output_tokens"]) +| extend model = tostring(customDimensions["gen_ai.request.model"]) +| where isnotnull(input_tokens) +| summarize + Calls = count(), + TotalInput = sum(input_tokens), + TotalOutput = sum(output_tokens) + by model +| order by TotalInput desc + +// ============================================================ +// Content Processing Specific Queries +// ============================================================ + +// 16. Token usage by file type (PDF, DOCX, image, etc.) +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(7d) +| extend total_tokens = toint(customDimensions['total_tokens']) +| extend input_tokens = toint(customDimensions['total_input_tokens']) +| extend output_tokens = toint(customDimensions['total_output_tokens']) +| extend mime_type = tostring(customDimensions['file_mime_type']) +| extend file_type = case( + mime_type has "pdf", "PDF", + mime_type has "image", "Image", + mime_type has "word" or mime_type has "docx", "Word", + mime_type has "excel" or mime_type has "xlsx", "Excel", + mime_type has "text", "Text", + "Other") +| summarize + Documents = count(), + TotalInputTokens = sum(input_tokens), + TotalOutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens), + AvgTokensPerDoc = round(avg(total_tokens), 0) + by FileType = file_type +| order by TotalTokens desc + +// 17. 
Per-document token breakdown by step +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| extend process_id = tostring(customDimensions['process_id']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize + InputTokens = sum(input_tokens), + OutputTokens = sum(output_tokens), + TotalTokens = sum(total_tokens) + by process_id, Step = agent +| order by process_id, TotalTokens desc + +// 18. Daily processing volume with token totals (no cost calculation) +customEvents +| where name == 'LLM_Token_Usage_Summary' +| where timestamp > ago(30d) +| extend total_tokens = toint(customDimensions['total_tokens']) +| extend file_name = tostring(customDimensions['file_name']) +| summarize + DocumentsProcessed = count(), + TotalTokens = sum(total_tokens), + AvgTokensPerDoc = round(avg(total_tokens), 0), + MaxTokensPerDoc = max(total_tokens) + by Day = bin(timestamp, 1d) +| order by Day desc diff --git a/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py b/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py index f3f20cb3..4eb964fe 100644 --- a/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py +++ b/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py @@ -28,6 +28,7 @@ from libs.pipeline.entities.pipeline_step_result import StepResult from libs.pipeline.entities.schema import Schema from libs.pipeline.queue_handler_base import HandlerBase +from libs.token_usage_utils import emit_agent_token_event, extract_token_usage from libs.utils.remote_schema_loader import load_schema_from_blob_json logger = logging.getLogger(__name__) @@ -263,6 +264,15 @@ async def execute(self, context: MessageContext) -> StepResult: options={"logprobs": True, "top_logprobs": 5}, ) + # Track token usage for this LLM call + token_usage = 
extract_token_usage(gpt_response) + emit_agent_token_event( + agent_name="MapHandler", + model_deployment_name=self.application_context.configuration.app_azure_openai_model, + usage=token_usage, + process_id=context.data_pipeline.pipeline_status.process_id, + ) + response_content = gpt_response.text # Json format string cleaned_content = ( diff --git a/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py b/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py index 15c90f56..33c15a66 100644 --- a/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py +++ b/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py @@ -20,6 +20,7 @@ from libs.pipeline.entities.schema import Schema from libs.pipeline.handlers.logics.evaluate_handler.model import DataExtractionResult from libs.pipeline.queue_handler_base import HandlerBase +from libs.token_usage_utils import emit_model_token_event, emit_summary_token_event class SaveHandler(HandlerBase): @@ -168,6 +169,27 @@ def find_process_result(step_name: str): collection_name=self.application_context.configuration.app_cosmos_container_process, ) + # Emit token usage summary and per-model events to Application Insights + emit_summary_token_event( + total_input_tokens=evaluated_result.prompt_tokens, + total_output_tokens=evaluated_result.completion_tokens, + total_tokens=evaluated_result.prompt_tokens + evaluated_result.completion_tokens, + process_id=context.data_pipeline.pipeline_status.process_id, + file_name=context.data_pipeline.get_source_files()[0].name, + file_mime_type=context.data_pipeline.get_source_files()[0].mime_type, + agent_count=1, + model_count=1, + ) + emit_model_token_event( + model_deployment_name=self.application_context.configuration.app_azure_openai_model, + usage={ + "input_tokens": evaluated_result.prompt_tokens, + "output_tokens": evaluated_result.completion_tokens, + "total_tokens": evaluated_result.prompt_tokens + evaluated_result.completion_tokens, + }, + 
process_id=context.data_pipeline.pipeline_status.process_id, + ) + # save process_output to blob storage. processed_history = context.data_pipeline.add_file( file_name="step_outputs.json", artifact_type=ArtifactType.SavedContent diff --git a/src/ContentProcessor/src/libs/token_usage_utils.py b/src/ContentProcessor/src/libs/token_usage_utils.py new file mode 100644 index 00000000..1a8d657e --- /dev/null +++ b/src/ContentProcessor/src/libs/token_usage_utils.py @@ -0,0 +1,231 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Token usage tracking for LLM calls in the content processing pipeline. + +Extracts token counts from Azure OpenAI agent framework responses and emits +custom events to Application Insights for monitoring, cost estimation, and +performance optimization. +""" + +import logging +import os +from typing import Any + +logger = logging.getLogger(__name__) + + +def _track_event_if_configured(event_name: str, event_data: dict) -> None: + """Track a custom event to Application Insights if configured. + + Args: + event_name: Name of the custom event. + event_data: Dictionary of event properties (all values must be strings). + """ + connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") + if connection_string: + try: + from azure.monitor.events.extension import track_event + + track_event(event_name, event_data) + except Exception as exc: + logger.warning("Failed to track event '%s': %s", event_name, exc) + else: + logger.debug( + "Skipping track_event for %s: Application Insights is not configured", + event_name, + ) + + +def extract_token_usage(response: Any) -> dict[str, int]: + """Extract token usage from an agent framework ChatMessage response. + + Checks multiple attribute paths to handle different response shapes + from the agent framework SDK. + + Args: + response: The ChatMessage response object from agent.run(). + + Returns: + Dict with keys: input_tokens, output_tokens, total_tokens. 
+ All default to 0 if not found. + """ + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + + # Path 1: usage_details attribute (set by agent framework SDK) + usage_details = getattr(response, "usage_details", None) + if isinstance(usage_details, dict): + input_tokens = _to_int( + usage_details.get("input_token_count") + or usage_details.get("prompt_tokens") + or usage_details.get("input_tokens") + ) + output_tokens = _to_int( + usage_details.get("output_token_count") + or usage_details.get("completion_tokens") + or usage_details.get("output_tokens") + ) + total_tokens = _to_int( + usage_details.get("total_token_count") + or usage_details.get("total_tokens") + ) or (input_tokens + output_tokens) + + # Path 2: raw_representation.usage (raw Azure OpenAI response) + if total_tokens == 0: + raw = getattr(response, "raw_representation", None) + if raw is not None: + usage_obj = getattr(raw, "usage", None) + if usage_obj is not None: + if isinstance(usage_obj, dict): + input_tokens = _to_int( + usage_obj.get("prompt_tokens") + or usage_obj.get("input_tokens") + ) + output_tokens = _to_int( + usage_obj.get("completion_tokens") + or usage_obj.get("output_tokens") + ) + total_tokens = _to_int( + usage_obj.get("total_tokens") + ) or (input_tokens + output_tokens) + else: + input_tokens = _to_int( + getattr(usage_obj, "prompt_tokens", 0) + or getattr(usage_obj, "input_tokens", 0) + ) + output_tokens = _to_int( + getattr(usage_obj, "completion_tokens", 0) + or getattr(usage_obj, "output_tokens", 0) + ) + total_tokens = _to_int( + getattr(usage_obj, "total_tokens", 0) + ) or (input_tokens + output_tokens) + + return { + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + } + + +def emit_agent_token_event( + agent_name: str, + model_deployment_name: str, + usage: dict[str, int], + process_id: str = "", +) -> None: + """Emit a per-agent token usage event to Application Insights. 
+ + Args: + agent_name: Name of the pipeline step/agent (e.g. 'MapHandler', 'RAI'). + model_deployment_name: Azure OpenAI model deployment name. + usage: Dict with input_tokens, output_tokens, total_tokens. + process_id: Document processing ID for correlation. + """ + _track_event_if_configured("LLM_Agent_Token_Usage", { + "agent_name": agent_name, + "input_tokens": str(usage.get("input_tokens", 0)), + "output_tokens": str(usage.get("output_tokens", 0)), + "total_tokens": str(usage.get("total_tokens", 0)), + "model_deployment_name": model_deployment_name, + "process_id": process_id, + }) + logger.info( + "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d process=%s", + agent_name, + model_deployment_name, + usage.get("input_tokens", 0), + usage.get("output_tokens", 0), + usage.get("total_tokens", 0), + process_id, + ) + + +def emit_model_token_event( + model_deployment_name: str, + usage: dict[str, int], + process_id: str = "", +) -> None: + """Emit a per-model token usage event to Application Insights. + + Args: + model_deployment_name: Azure OpenAI model deployment name. + usage: Dict with input_tokens, output_tokens, total_tokens. + process_id: Document processing ID for correlation. + """ + _track_event_if_configured("LLM_Model_Token_Usage", { + "model_deployment_name": model_deployment_name, + "input_tokens": str(usage.get("input_tokens", 0)), + "output_tokens": str(usage.get("output_tokens", 0)), + "total_tokens": str(usage.get("total_tokens", 0)), + "process_id": process_id, + }) + + +def emit_summary_token_event( + total_input_tokens: int, + total_output_tokens: int, + total_tokens: int, + process_id: str = "", + file_name: str = "", + file_mime_type: str = "", + agent_count: int = 0, + model_count: int = 0, +) -> None: + """Emit a summary token usage event for a complete document processing run. + + Args: + total_input_tokens: Sum of all input tokens across all steps. + total_output_tokens: Sum of all output tokens across all steps. 
+ total_tokens: Sum of all tokens across all steps. + process_id: Document processing ID. + file_name: Name of the processed file. + file_mime_type: MIME type of the processed file. + agent_count: Number of agents/steps that used tokens. + model_count: Number of distinct models used. + """ + _track_event_if_configured("LLM_Token_Usage_Summary", { + "total_input_tokens": str(total_input_tokens), + "total_output_tokens": str(total_output_tokens), + "total_tokens": str(total_tokens), + "process_id": process_id, + "file_name": file_name, + "file_mime_type": file_mime_type, + "agent_count": str(agent_count), + "model_count": str(model_count), + }) + logger.info( + "[TOKEN SUMMARY] process=%s file=%s input=%d output=%d total=%d agents=%d models=%d", + process_id, + file_name, + total_input_tokens, + total_output_tokens, + total_tokens, + agent_count, + model_count, + ) + + +def _to_int(val: object, default: int = 0) -> int: + """Safely convert a value to int. + + Args: + val: Value to convert. + default: Default if conversion fails. + + Returns: + Integer value or default. + """ + if val is None or isinstance(val, bool): + return default + if isinstance(val, int): + return val + if isinstance(val, float): + return int(val) + if isinstance(val, str): + s = val.strip() + if s.isdigit(): + return int(s) + return default diff --git a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py new file mode 100644 index 00000000..3fcf374e --- /dev/null +++ b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py @@ -0,0 +1,231 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for libs.token_usage_utils (token usage extraction and event emission).""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from libs.token_usage_utils import ( + _to_int, + emit_agent_token_event, + emit_model_token_event, + emit_summary_token_event, + extract_token_usage, +) + + +# ── _to_int helper ───────────────────────────────────────────────────── + + +class TestToInt: + """Conversion helper for safely casting token counts.""" + + def test_none_returns_default(self): + assert _to_int(None) == 0 + + def test_bool_returns_default(self): + assert _to_int(True) == 0 + assert _to_int(False) == 0 + + def test_int_passthrough(self): + assert _to_int(42) == 42 + + def test_float_truncates(self): + assert _to_int(3.7) == 3 + + def test_digit_string(self): + assert _to_int("100") == 100 + + def test_non_digit_string_returns_default(self): + assert _to_int("abc") == 0 + + def test_custom_default(self): + assert _to_int(None, default=5) == 5 + + +# ── extract_token_usage ──────────────────────────────────────────────── + + +class TestExtractTokenUsage: + """Token extraction from various response shapes.""" + + def test_usage_details_dict_with_standard_keys(self): + response = MagicMock() + response.usage_details = { + "input_token_count": 100, + "output_token_count": 50, + "total_token_count": 150, + } + result = extract_token_usage(response) + assert result == { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + } + + def test_usage_details_dict_with_openai_keys(self): + response = MagicMock() + response.usage_details = { + "prompt_tokens": 200, + "completion_tokens": 80, + "total_tokens": 280, + } + result = extract_token_usage(response) + assert result == { + "input_tokens": 200, + "output_tokens": 80, + "total_tokens": 280, + } + + def test_usage_details_none_falls_to_raw_representation(self): + response = MagicMock() + response.usage_details = None + usage_obj = MagicMock() + 
usage_obj.prompt_tokens = 300 + usage_obj.completion_tokens = 120 + usage_obj.total_tokens = 420 + usage_obj.input_tokens = 0 + usage_obj.output_tokens = 0 + response.raw_representation.usage = usage_obj + result = extract_token_usage(response) + assert result == { + "input_tokens": 300, + "output_tokens": 120, + "total_tokens": 420, + } + + def test_raw_representation_dict_usage(self): + response = MagicMock() + response.usage_details = None + response.raw_representation.usage = { + "prompt_tokens": 50, + "completion_tokens": 25, + "total_tokens": 75, + } + result = extract_token_usage(response) + assert result == { + "input_tokens": 50, + "output_tokens": 25, + "total_tokens": 75, + } + + def test_no_usage_returns_zeros(self): + response = MagicMock() + response.usage_details = None + response.raw_representation = None + result = extract_token_usage(response) + assert result == { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + } + + def test_total_computed_from_input_output_when_missing(self): + response = MagicMock() + response.usage_details = { + "input_token_count": 100, + "output_token_count": 50, + } + result = extract_token_usage(response) + assert result["total_tokens"] == 150 + + +# ── emit_agent_token_event ───────────────────────────────────────────── + + +class TestEmitAgentTokenEvent: + """Custom event emission for per-agent token usage.""" + + @patch("libs.token_usage_utils._track_event_if_configured") + def test_emits_correct_event(self, mock_track): + usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + emit_agent_token_event( + agent_name="MapHandler", + model_deployment_name="gpt-4o", + usage=usage, + process_id="proc-123", + ) + mock_track.assert_called_once_with("LLM_Agent_Token_Usage", { + "agent_name": "MapHandler", + "input_tokens": "100", + "output_tokens": "50", + "total_tokens": "150", + "model_deployment_name": "gpt-4o", + "process_id": "proc-123", + }) + + +# ── emit_model_token_event 
───────────────────────────────────────────── + + +class TestEmitModelTokenEvent: + """Custom event emission for per-model token usage.""" + + @patch("libs.token_usage_utils._track_event_if_configured") + def test_emits_correct_event(self, mock_track): + usage = {"input_tokens": 200, "output_tokens": 80, "total_tokens": 280} + emit_model_token_event( + model_deployment_name="gpt-4o", + usage=usage, + process_id="proc-456", + ) + mock_track.assert_called_once_with("LLM_Model_Token_Usage", { + "model_deployment_name": "gpt-4o", + "input_tokens": "200", + "output_tokens": "80", + "total_tokens": "280", + "process_id": "proc-456", + }) + + +# ── emit_summary_token_event ────────────────────────────────────────── + + +class TestEmitSummaryTokenEvent: + """Custom event emission for document-level token summary.""" + + @patch("libs.token_usage_utils._track_event_if_configured") + def test_emits_correct_event(self, mock_track): + emit_summary_token_event( + total_input_tokens=500, + total_output_tokens=200, + total_tokens=700, + process_id="proc-789", + file_name="test.pdf", + file_mime_type="application/pdf", + agent_count=2, + model_count=1, + ) + mock_track.assert_called_once_with("LLM_Token_Usage_Summary", { + "total_input_tokens": "500", + "total_output_tokens": "200", + "total_tokens": "700", + "process_id": "proc-789", + "file_name": "test.pdf", + "file_mime_type": "application/pdf", + "agent_count": "2", + "model_count": "1", + }) + + +# ── _track_event_if_configured ──────────────────────────────────────── + + +class TestTrackEventIfConfigured: + """Application Insights event tracking guard.""" + + @patch.dict("os.environ", {"APPLICATIONINSIGHTS_CONNECTION_STRING": "InstrumentationKey=test"}) + @patch("azure.monitor.events.extension.track_event") + def test_tracks_when_configured(self, mock_track_event): + from libs.token_usage_utils import _track_event_if_configured + + _track_event_if_configured("test_event", {"key": "value"}) + 
mock_track_event.assert_called_once_with("test_event", {"key": "value"}) + + @patch.dict("os.environ", {}, clear=True) + def test_skips_when_not_configured(self): + from libs.token_usage_utils import _track_event_if_configured + + _track_event_if_configured("test_event", {"key": "value"}) diff --git a/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py b/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py new file mode 100644 index 00000000..1a8d657e --- /dev/null +++ b/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py @@ -0,0 +1,231 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Token usage tracking for LLM calls in the content processing pipeline. + +Extracts token counts from Azure OpenAI agent framework responses and emits +custom events to Application Insights for monitoring, cost estimation, and +performance optimization. +""" + +import logging +import os +from typing import Any + +logger = logging.getLogger(__name__) + + +def _track_event_if_configured(event_name: str, event_data: dict) -> None: + """Track a custom event to Application Insights if configured. + + Args: + event_name: Name of the custom event. + event_data: Dictionary of event properties (all values must be strings). + """ + connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") + if connection_string: + try: + from azure.monitor.events.extension import track_event + + track_event(event_name, event_data) + except Exception as exc: + logger.warning("Failed to track event '%s': %s", event_name, exc) + else: + logger.debug( + "Skipping track_event for %s: Application Insights is not configured", + event_name, + ) + + +def extract_token_usage(response: Any) -> dict[str, int]: + """Extract token usage from an agent framework ChatMessage response. + + Checks multiple attribute paths to handle different response shapes + from the agent framework SDK. + + Args: + response: The ChatMessage response object from agent.run(). 
+ + Returns: + Dict with keys: input_tokens, output_tokens, total_tokens. + All default to 0 if not found. + """ + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + + # Path 1: usage_details attribute (set by agent framework SDK) + usage_details = getattr(response, "usage_details", None) + if isinstance(usage_details, dict): + input_tokens = _to_int( + usage_details.get("input_token_count") + or usage_details.get("prompt_tokens") + or usage_details.get("input_tokens") + ) + output_tokens = _to_int( + usage_details.get("output_token_count") + or usage_details.get("completion_tokens") + or usage_details.get("output_tokens") + ) + total_tokens = _to_int( + usage_details.get("total_token_count") + or usage_details.get("total_tokens") + ) or (input_tokens + output_tokens) + + # Path 2: raw_representation.usage (raw Azure OpenAI response) + if total_tokens == 0: + raw = getattr(response, "raw_representation", None) + if raw is not None: + usage_obj = getattr(raw, "usage", None) + if usage_obj is not None: + if isinstance(usage_obj, dict): + input_tokens = _to_int( + usage_obj.get("prompt_tokens") + or usage_obj.get("input_tokens") + ) + output_tokens = _to_int( + usage_obj.get("completion_tokens") + or usage_obj.get("output_tokens") + ) + total_tokens = _to_int( + usage_obj.get("total_tokens") + ) or (input_tokens + output_tokens) + else: + input_tokens = _to_int( + getattr(usage_obj, "prompt_tokens", 0) + or getattr(usage_obj, "input_tokens", 0) + ) + output_tokens = _to_int( + getattr(usage_obj, "completion_tokens", 0) + or getattr(usage_obj, "output_tokens", 0) + ) + total_tokens = _to_int( + getattr(usage_obj, "total_tokens", 0) + ) or (input_tokens + output_tokens) + + return { + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + } + + +def emit_agent_token_event( + agent_name: str, + model_deployment_name: str, + usage: dict[str, int], + process_id: str = "", +) -> None: + """Emit a per-agent token usage event 
to Application Insights. + + Args: + agent_name: Name of the pipeline step/agent (e.g. 'MapHandler', 'RAI'). + model_deployment_name: Azure OpenAI model deployment name. + usage: Dict with input_tokens, output_tokens, total_tokens. + process_id: Document processing ID for correlation. + """ + _track_event_if_configured("LLM_Agent_Token_Usage", { + "agent_name": agent_name, + "input_tokens": str(usage.get("input_tokens", 0)), + "output_tokens": str(usage.get("output_tokens", 0)), + "total_tokens": str(usage.get("total_tokens", 0)), + "model_deployment_name": model_deployment_name, + "process_id": process_id, + }) + logger.info( + "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d process=%s", + agent_name, + model_deployment_name, + usage.get("input_tokens", 0), + usage.get("output_tokens", 0), + usage.get("total_tokens", 0), + process_id, + ) + + +def emit_model_token_event( + model_deployment_name: str, + usage: dict[str, int], + process_id: str = "", +) -> None: + """Emit a per-model token usage event to Application Insights. + + Args: + model_deployment_name: Azure OpenAI model deployment name. + usage: Dict with input_tokens, output_tokens, total_tokens. + process_id: Document processing ID for correlation. + """ + _track_event_if_configured("LLM_Model_Token_Usage", { + "model_deployment_name": model_deployment_name, + "input_tokens": str(usage.get("input_tokens", 0)), + "output_tokens": str(usage.get("output_tokens", 0)), + "total_tokens": str(usage.get("total_tokens", 0)), + "process_id": process_id, + }) + + +def emit_summary_token_event( + total_input_tokens: int, + total_output_tokens: int, + total_tokens: int, + process_id: str = "", + file_name: str = "", + file_mime_type: str = "", + agent_count: int = 0, + model_count: int = 0, +) -> None: + """Emit a summary token usage event for a complete document processing run. + + Args: + total_input_tokens: Sum of all input tokens across all steps. 
+ total_output_tokens: Sum of all output tokens across all steps. + total_tokens: Sum of all tokens across all steps. + process_id: Document processing ID. + file_name: Name of the processed file. + file_mime_type: MIME type of the processed file. + agent_count: Number of agents/steps that used tokens. + model_count: Number of distinct models used. + """ + _track_event_if_configured("LLM_Token_Usage_Summary", { + "total_input_tokens": str(total_input_tokens), + "total_output_tokens": str(total_output_tokens), + "total_tokens": str(total_tokens), + "process_id": process_id, + "file_name": file_name, + "file_mime_type": file_mime_type, + "agent_count": str(agent_count), + "model_count": str(model_count), + }) + logger.info( + "[TOKEN SUMMARY] process=%s file=%s input=%d output=%d total=%d agents=%d models=%d", + process_id, + file_name, + total_input_tokens, + total_output_tokens, + total_tokens, + agent_count, + model_count, + ) + + +def _to_int(val: object, default: int = 0) -> int: + """Safely convert a value to int. + + Args: + val: Value to convert. + default: Default if conversion fails. + + Returns: + Integer value or default. 
+ """ + if val is None or isinstance(val, bool): + return default + if isinstance(val, int): + return val + if isinstance(val, float): + return int(val) + if isinstance(val, str): + s = val.strip() + if s.isdigit(): + return int(s) + return default diff --git a/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py b/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py index 5390b09b..9d21d555 100644 --- a/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py @@ -31,6 +31,8 @@ from steps.models.extracted_file import ExtractedFile from steps.models.output import Executor_Output, Workflow_Output +from libs.token_usage_utils import emit_agent_token_event, extract_token_usage + class GapExecutor(Executor): """Workflow executor that runs the GAP-analysis step. @@ -192,6 +194,16 @@ async def handle_execute( ) ) + # Track token usage for gap analysis + token_usage = extract_token_usage(model_response) + model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name + emit_agent_token_event( + agent_name="GapAnalysis", + model_deployment_name=model_name, + usage=token_usage, + process_id=result.claim_process_id, + ) + claim_process_repository = self.app_context.get_service(Claim_Processes) await claim_process_repository.Update_Claim_Process_Gaps( process_id=result.claim_process_id, new_gaps=model_response.text diff --git a/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py b/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py index 64bca6f3..9a33735f 100644 --- a/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py @@ -27,6 +27,8 @@ from services.content_process_service import ContentProcessService from steps.rai.model import rai_response +from libs.token_usage_utils 
import emit_agent_token_event, extract_token_usage + class RAIExecutor(Executor): """Workflow executor that applies Responsible-AI content analysis. @@ -186,6 +188,16 @@ async def handle_exectue( ) ) + # Track token usage for RAI check + token_usage = extract_token_usage(model_response) + model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name + emit_agent_token_event( + agent_name="RAI", + model_deployment_name=model_name, + usage=token_usage, + process_id=result.claim_process_id, + ) + response_content = model_response.text parsed_response = rai_response.RAIResponse.model_validate_json(response_content) diff --git a/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py b/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py index a4e8b910..f2e5c8ca 100644 --- a/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py @@ -28,6 +28,8 @@ from steps.models.extracted_file import ExtractedFile from steps.models.output import Executor_Output, Workflow_Output +from libs.token_usage_utils import emit_agent_token_event, extract_token_usage + class SummarizeExecutor(Executor): """Workflow executor that runs the summarization step. 
@@ -192,6 +194,16 @@ async def handle_execute( ) ) + # Track token usage for summarization + token_usage = extract_token_usage(model_response) + model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name + emit_agent_token_event( + agent_name="Summarize", + model_deployment_name=model_name, + usage=token_usage, + process_id=result.claim_process_id, + ) + summarized_result = {"status": "summarized", "input": model_response.text} claim_process_repository = self.app_context.get_service(Claim_Processes) From 8bc890168debe982df34e60e3ff97b7c6f6f9e48 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Wed, 13 May 2026 21:22:27 +0530 Subject: [PATCH 02/23] fix: Handle UsageDetails object and add missing dependencies - Fix extract_token_usage() to handle UsageDetails object (not just dict) from agent framework SDK, resolving 0-token reports for RAI, Summarize, and GapAnalysis agents - Add azure-monitor-events-extension dependency to ContentProcessor and ContentProcessorWorkflow pyproject.toml - Add unit test for UsageDetails object handling - Enable Application Insights monitoring in main.parameters.json Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/main.parameters.json | 3 ++ src/ContentProcessor/pyproject.toml | 1 + .../src/libs/token_usage_utils.py | 44 ++++++++++++------- .../tests/unit/libs/test_token_usage_utils.py | 15 +++++++ src/ContentProcessor/uv.lock | 15 +++++++ src/ContentProcessorWorkflow/pyproject.toml | 1 + .../src/libs/token_usage_utils.py | 44 ++++++++++++------- src/ContentProcessorWorkflow/uv.lock | 29 +++++++++--- 8 files changed, 115 insertions(+), 37 deletions(-) diff --git a/infra/main.parameters.json b/infra/main.parameters.json index 44153d57..d20510b0 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -29,6 +29,9 @@ "existingFoundryProjectResourceId": { "value": "${AZURE_EXISTING_AIPROJECT_RESOURCE_ID}" }, + "enableMonitoring": { + "value": true + 
}, "containerRegistryEndpoint": { "value": "${AZURE_ENV_CONTAINER_REGISTRY_ENDPOINT}" }, diff --git a/src/ContentProcessor/pyproject.toml b/src/ContentProcessor/pyproject.toml index 310524ce..b4b8172a 100644 --- a/src/ContentProcessor/pyproject.toml +++ b/src/ContentProcessor/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "azure-ai-inference==1.0.0b9", "azure-appconfiguration==1.8.0", "azure-identity==1.26.0b1", + "azure-monitor-events-extension>=0.1.0", "azure-monitor-opentelemetry==1.8.7", "azure-storage-blob==12.29.0b1", "azure-storage-queue==12.16.0b1", diff --git a/src/ContentProcessor/src/libs/token_usage_utils.py b/src/ContentProcessor/src/libs/token_usage_utils.py index 1a8d657e..b88c5cd5 100644 --- a/src/ContentProcessor/src/libs/token_usage_utils.py +++ b/src/ContentProcessor/src/libs/token_usage_utils.py @@ -56,21 +56,35 @@ def extract_token_usage(response: Any) -> dict[str, int]: # Path 1: usage_details attribute (set by agent framework SDK) usage_details = getattr(response, "usage_details", None) - if isinstance(usage_details, dict): - input_tokens = _to_int( - usage_details.get("input_token_count") - or usage_details.get("prompt_tokens") - or usage_details.get("input_tokens") - ) - output_tokens = _to_int( - usage_details.get("output_token_count") - or usage_details.get("completion_tokens") - or usage_details.get("output_tokens") - ) - total_tokens = _to_int( - usage_details.get("total_token_count") - or usage_details.get("total_tokens") - ) or (input_tokens + output_tokens) + if usage_details is not None: + if isinstance(usage_details, dict): + input_tokens = _to_int( + usage_details.get("input_token_count") + or usage_details.get("prompt_tokens") + or usage_details.get("input_tokens") + ) + output_tokens = _to_int( + usage_details.get("output_token_count") + or usage_details.get("completion_tokens") + or usage_details.get("output_tokens") + ) + total_tokens = _to_int( + usage_details.get("total_token_count") + or usage_details.get("total_tokens") + 
) or (input_tokens + output_tokens) + else: + # UsageDetails object with attributes + input_tokens = _to_int( + getattr(usage_details, "input_token_count", 0) + or getattr(usage_details, "prompt_tokens", 0) + ) + output_tokens = _to_int( + getattr(usage_details, "output_token_count", 0) + or getattr(usage_details, "completion_tokens", 0) + ) + total_tokens = _to_int( + getattr(usage_details, "total_token_count", 0) + ) or (input_tokens + output_tokens) # Path 2: raw_representation.usage (raw Azure OpenAI response) if total_tokens == 0: diff --git a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py index 3fcf374e..0454e411 100644 --- a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py +++ b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py @@ -111,6 +111,21 @@ def test_raw_representation_dict_usage(self): "total_tokens": 75, } + def test_usage_details_object_with_attributes(self): + """Handle UsageDetails object (not dict) from agent framework.""" + response = MagicMock() + usage_obj = MagicMock() + usage_obj.input_token_count = 400 + usage_obj.output_token_count = 150 + usage_obj.total_token_count = 550 + response.usage_details = usage_obj + result = extract_token_usage(response) + assert result == { + "input_tokens": 400, + "output_tokens": 150, + "total_tokens": 550, + } + def test_no_usage_returns_zeros(self): response = MagicMock() response.usage_details = None diff --git a/src/ContentProcessor/uv.lock b/src/ContentProcessor/uv.lock index 4dcc1bf0..dcc6f3d4 100644 --- a/src/ContentProcessor/uv.lock +++ b/src/ContentProcessor/uv.lock @@ -698,6 +698,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/28/af9ef022f21e3b51b3718d4348f771b490678c1116563895547c0a771362/azure_identity-1.26.0b1-py3-none-any.whl", hash = "sha256:dc608b59ae628a38611208ee761adeb1a2b9390258b58d6edcda2d24c50a4348", size = 197227, upload-time = "2025-11-07T03:04:16.923Z" }, 
] +[[package]] +name = "azure-monitor-events-extension" +version = "0.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-sdk" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/51/976c8cd4a76d41bcd4d3f6400aeed8fdd70d516d271badf9c4a5893a558d/azure-monitor-events-extension-0.1.0.tar.gz", hash = "sha256:094773685171a50aa5cc548279c9141c8a26682f6acef397815c528b53b838b5", size = 4165, upload-time = "2023-09-19T20:01:17.887Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/44/cbb68c55505a604de61caa44375be7371368e71aa8386b1576be5b789e11/azure_monitor_events_extension-0.1.0-py2.py3-none-any.whl", hash = "sha256:5d92abb5e6a32ab23b12c726def9f9607c6fa1d84900d493b906ff9ec489af4a", size = 4514, upload-time = "2023-09-19T20:01:16.162Z" }, +] + [[package]] name = "azure-monitor-opentelemetry" version = "1.8.7" @@ -974,6 +987,7 @@ dependencies = [ { name = "azure-ai-inference" }, { name = "azure-appconfiguration" }, { name = "azure-identity" }, + { name = "azure-monitor-events-extension" }, { name = "azure-monitor-opentelemetry" }, { name = "azure-storage-blob" }, { name = "azure-storage-queue" }, @@ -1011,6 +1025,7 @@ requires-dist = [ { name = "azure-ai-inference", specifier = "==1.0.0b9" }, { name = "azure-appconfiguration", specifier = "==1.8.0" }, { name = "azure-identity", specifier = "==1.26.0b1" }, + { name = "azure-monitor-events-extension", specifier = ">=0.1.0" }, { name = "azure-monitor-opentelemetry", specifier = "==1.8.7" }, { name = "azure-storage-blob", specifier = "==12.29.0b1" }, { name = "azure-storage-queue", specifier = "==12.16.0b1" }, diff --git a/src/ContentProcessorWorkflow/pyproject.toml b/src/ContentProcessorWorkflow/pyproject.toml index 804ed5f4..2046388b 100644 --- a/src/ContentProcessorWorkflow/pyproject.toml +++ b/src/ContentProcessorWorkflow/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "azure-appconfiguration==1.8.0", 
"azure-core==1.38.0", "azure-identity==1.26.0b1", + "azure-monitor-events-extension>=0.1.0", "azure-monitor-opentelemetry==1.8.7", "azure-storage-blob==12.29.0b1", "azure-storage-file-datalake==12.23.0", diff --git a/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py b/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py index 1a8d657e..b88c5cd5 100644 --- a/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py +++ b/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py @@ -56,21 +56,35 @@ def extract_token_usage(response: Any) -> dict[str, int]: # Path 1: usage_details attribute (set by agent framework SDK) usage_details = getattr(response, "usage_details", None) - if isinstance(usage_details, dict): - input_tokens = _to_int( - usage_details.get("input_token_count") - or usage_details.get("prompt_tokens") - or usage_details.get("input_tokens") - ) - output_tokens = _to_int( - usage_details.get("output_token_count") - or usage_details.get("completion_tokens") - or usage_details.get("output_tokens") - ) - total_tokens = _to_int( - usage_details.get("total_token_count") - or usage_details.get("total_tokens") - ) or (input_tokens + output_tokens) + if usage_details is not None: + if isinstance(usage_details, dict): + input_tokens = _to_int( + usage_details.get("input_token_count") + or usage_details.get("prompt_tokens") + or usage_details.get("input_tokens") + ) + output_tokens = _to_int( + usage_details.get("output_token_count") + or usage_details.get("completion_tokens") + or usage_details.get("output_tokens") + ) + total_tokens = _to_int( + usage_details.get("total_token_count") + or usage_details.get("total_tokens") + ) or (input_tokens + output_tokens) + else: + # UsageDetails object with attributes + input_tokens = _to_int( + getattr(usage_details, "input_token_count", 0) + or getattr(usage_details, "prompt_tokens", 0) + ) + output_tokens = _to_int( + getattr(usage_details, "output_token_count", 0) + or getattr(usage_details, 
"completion_tokens", 0) + ) + total_tokens = _to_int( + getattr(usage_details, "total_token_count", 0) + ) or (input_tokens + output_tokens) # Path 2: raw_representation.usage (raw Azure OpenAI response) if total_tokens == 0: diff --git a/src/ContentProcessorWorkflow/uv.lock b/src/ContentProcessorWorkflow/uv.lock index 9fd628a0..608166a4 100644 --- a/src/ContentProcessorWorkflow/uv.lock +++ b/src/ContentProcessorWorkflow/uv.lock @@ -736,6 +736,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/28/af9ef022f21e3b51b3718d4348f771b490678c1116563895547c0a771362/azure_identity-1.26.0b1-py3-none-any.whl", hash = "sha256:dc608b59ae628a38611208ee761adeb1a2b9390258b58d6edcda2d24c50a4348", size = 197227, upload-time = "2025-11-07T03:04:16.923Z" }, ] +[[package]] +name = "azure-monitor-events-extension" +version = "0.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-sdk" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/51/976c8cd4a76d41bcd4d3f6400aeed8fdd70d516d271badf9c4a5893a558d/azure-monitor-events-extension-0.1.0.tar.gz", hash = "sha256:094773685171a50aa5cc548279c9141c8a26682f6acef397815c528b53b838b5", size = 4165, upload-time = "2023-09-19T20:01:17.887Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/44/cbb68c55505a604de61caa44375be7371368e71aa8386b1576be5b789e11/azure_monitor_events_extension-0.1.0-py2.py3-none-any.whl", hash = "sha256:5d92abb5e6a32ab23b12c726def9f9607c6fa1d84900d493b906ff9ec489af4a", size = 4514, upload-time = "2023-09-19T20:01:16.162Z" }, +] + [[package]] name = "azure-monitor-opentelemetry" version = "1.8.7" @@ -2960,6 +2973,7 @@ dependencies = [ { name = "azure-appconfiguration" }, { name = "azure-core" }, { name = "azure-identity" }, + { name = "azure-monitor-events-extension" }, { name = "azure-monitor-opentelemetry" }, { name = "azure-storage-blob" }, { name = "azure-storage-file-datalake" }, @@ -3001,6 
+3015,7 @@ requires-dist = [ { name = "azure-appconfiguration", specifier = "==1.8.0" }, { name = "azure-core", specifier = "==1.38.0" }, { name = "azure-identity", specifier = "==1.26.0b1" }, + { name = "azure-monitor-events-extension", specifier = ">=0.1.0" }, { name = "azure-monitor-opentelemetry", specifier = "==1.8.7" }, { name = "azure-storage-blob", specifier = "==12.29.0b1" }, { name = "azure-storage-file-datalake", specifier = "==12.23.0" }, @@ -3015,7 +3030,7 @@ requires-dist = [ { name = "psutil", specifier = "==7.2.1" }, { name = "pyasn1", specifier = "==0.6.3" }, { name = "pyjwt", specifier = "==2.12.1" }, - { name = "python-multipart", specifier = "==0.0.27" }, + { name = "python-multipart", specifier = "==0.0.26" }, { name = "pytz", specifier = "==2025.2" }, { name = "sas-cosmosdb", specifier = "==0.1.4" }, { name = "sas-storage", specifier = "==1.0.0" }, @@ -3482,11 +3497,11 @@ wheels = [ [[package]] name = "python-multipart" -version = "0.0.27" +version = "0.0.26" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/69/9b/f23807317a113dc36e74e75eb265a02dd1a4d9082abc3c1064acd22997c4/python_multipart-0.0.27.tar.gz", hash = "sha256:9870a6a8c5a20a5bf4f07c017bd1489006ff8836cff097b6933355ee2b49b602", size = 44043, upload-time = "2026-04-27T10:51:26.649Z" } +sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/99/78/4126abcbdbd3c559d43e0db7f7b9173fc6befe45d39a2856cc0b8ec2a5a6/python_multipart-0.0.27-py3-none-any.whl", hash = "sha256:6fccfad17a27334bd0193681b369f476eda3409f17381a2d65aa7df3f7275645", size = 29254, upload-time = "2026-04-27T10:51:24.997Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" }, ] [[package]] @@ -4077,11 +4092,11 @@ wheels = [ [[package]] name = "urllib3" -version = "2.7.0" +version = "2.6.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] [[package]] From bcdcc255063b69478ab0a654430d3c0471b09e84 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Wed, 13 May 2026 21:52:35 +0530 Subject: [PATCH 03/23] feat: Add portable Azure Workbook template for token usage dashboard Adds a standalone workbook JSON template that can be imported into any Application Insights instance without being tied to a 
specific resource. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 236 +++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 infra/dashboards/token-usage-workbook.json diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json new file mode 100644 index 00000000..367956ad --- /dev/null +++ b/infra/dashboards/token-usage-workbook.json @@ -0,0 +1,236 @@ +{ + "version": "Notebook/1.0", + "$schema": "https://github.com/Microsoft/Application-Insights-Workbooks/blob/master/schema/workbook.json", + "items": [ + { + "type": 1, + "content": { + "json": "# LLM Token Usage Dashboard\n\nMonitors token consumption across all pipeline agents in Content Processing Solution Accelerator.\n\n---" + }, + "name": "title" + }, + { + "type": 9, + "content": { + "version": "KqlParameterItem/1.0", + "parameters": [ + { + "id": "a0b1c2d3-e4f5-6789-abcd-ef0123456789", + "version": "KqlParameterItem/1.0", + "name": "TimeRange", + "label": "Time Range", + "type": 4, + "isRequired": true, + "value": { + "durationMs": 604800000 + }, + "typeSettings": { + "selectableValues": [ + { "durationMs": 3600000 }, + { "durationMs": 86400000 }, + { "durationMs": 604800000 }, + { "durationMs": 2592000000 } + ] + } + } + ], + "style": "pills" + }, + "name": "parameters" + }, + { + "type": 1, + "content": { + "json": "## Overview" + }, + "name": "section-overview" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n TotalDocuments = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = 
sum(total_tokens),\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "size": 4, + "title": "Overall Token Usage Summary", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "tiles", + "tileSettings": { + "showBorder": true + } + }, + "name": "overall-summary" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp {TimeRange}\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Step = agent\n| order by TotalTokens desc", + "size": 0, + "title": "Token Usage by Pipeline Step", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "barchart" + }, + "name": "tokens-by-agent" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\n| order by timestamp asc", + "size": 0, + "title": "Token Usage Over Time (Hourly)", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "areachart" + }, + "name": "tokens-over-time" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp {TimeRange}\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = 
toint(customDimensions['total_tokens'])\n| summarize TotalTokens = sum(total_tokens) by agent", + "size": 0, + "title": "Token Distribution by Agent", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "piechart" + }, + "name": "token-distribution-pie" + }, + { + "type": 1, + "content": { + "json": "## Cost Estimation" + }, + "name": "section-cost" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "let input_price_per_million = 2.50;\nlet output_price_per_million = 10.00;\ncustomEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\n| extend TotalCost = InputCost + OutputCost\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\n| order by Day desc", + "size": 0, + "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "cost-estimation" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "let gpt4o_input = 2.50;\nlet gpt4o_output = 10.00;\nlet gpt4o_mini_input = 0.15;\nlet gpt4o_mini_output = 0.60;\ncustomEvents\n| where name == 'LLM_Model_Token_Usage'\n| where timestamp {TimeRange}\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\n| extend 
InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\n| extend TotalCost = InputCost + OutputCost\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\n| order by TotalCost desc", + "size": 0, + "title": "Estimated Cost by Model", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "cost-by-model" + }, + { + "type": 1, + "content": { + "json": "## Model & Document Details" + }, + "name": "section-details" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Model_Token_Usage'\n| where timestamp {TimeRange}\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Model = model\n| order by TotalTokens desc", + "size": 0, + "title": "Token Usage by Model", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "tokens-by-model" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp {TimeRange}\n| extend agent = tostring(customDimensions['agent_name'])\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = 
toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Step = agent, Model = model\n| order by TotalTokens desc", + "size": 0, + "title": "Step-to-Model Token Mapping", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "step-model-mapping" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend process_id = tostring(customDimensions['process_id'])\n| extend file_name = tostring(customDimensions['file_name'])\n| summarize TotalTokens = sum(total_tokens) by process_id, file_name\n| order by TotalTokens desc\n| take 20", + "size": 0, + "title": "Top 20 Token Consumers by Document", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "top-consumers" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| extend mime_type = tostring(customDimensions['file_mime_type'])\n| extend file_type = case(\n mime_type has \"pdf\", \"PDF\",\n mime_type has \"image\", \"Image\",\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\n mime_type has \"text\", \"Text\",\n \"Other\")\n| summarize\n Documents = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = 
round(avg(total_tokens), 0)\n by FileType = file_type\n| order by TotalTokens desc", + "size": 0, + "title": "Token Usage by File Type", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "barchart" + }, + "name": "tokens-by-filetype" + }, + { + "type": 1, + "content": { + "json": "## Percentiles & Trends" + }, + "name": "section-percentiles" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n p50 = percentile(total_tokens, 50),\n p90 = percentile(total_tokens, 90),\n p95 = percentile(total_tokens, 95),\n p99 = percentile(total_tokens, 99),\n Max = max(total_tokens)", + "size": 4, + "title": "Token Usage Percentiles Per Document", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "tiles", + "tileSettings": { + "showBorder": true + } + }, + "name": "token-percentiles" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend file_name = tostring(customDimensions['file_name'])\n| summarize\n DocumentsProcessed = count(),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0),\n MaxTokensPerDoc = max(total_tokens)\n by Day = bin(timestamp, 1d)\n| order by Day desc", + "size": 0, + "title": "Daily Processing Volume with Token Usage", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "daily-volume" + } + ], + "isLocked": false, + "fallbackResourceIds": [ + "Azure Monitor" + ] +} From 902252ab1c23d0af71cdfe4a299587e315a73854 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Thu, 14 May 2026 13:13:12 +0530 Subject: [PATCH 04/23] fix: Update 
workbook template format for Advanced Editor import Removed $schema field, added timeContext/timeContextFromParameter, used \r\n in queries, added fromTemplateId for proper template import. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 102 +++++++++++++++------ 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 367956ad..bfab3f9e 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -1,11 +1,10 @@ { "version": "Notebook/1.0", - "$schema": "https://github.com/Microsoft/Application-Insights-Workbooks/blob/master/schema/workbook.json", "items": [ { "type": 1, "content": { - "json": "# LLM Token Usage Dashboard\n\nMonitors token consumption across all pipeline agents in Content Processing Solution Accelerator.\n\n---" + "json": "# LLM Token Usage Dashboard\n\nMonitors token consumption across all pipeline agents in Content Processing Solution Accelerator." 
}, "name": "title" }, @@ -26,22 +25,25 @@ }, "typeSettings": { "selectableValues": [ - { "durationMs": 3600000 }, - { "durationMs": 86400000 }, - { "durationMs": 604800000 }, - { "durationMs": 2592000000 } - ] + {"durationMs": 3600000}, + {"durationMs": 86400000}, + {"durationMs": 604800000}, + {"durationMs": 2592000000} + ], + "allowCustom": true } } ], - "style": "pills" + "style": "pills", + "queryType": 0, + "resourceType": "microsoft.insights/components" }, "name": "parameters" }, { "type": 1, "content": { - "json": "## Overview" + "json": "---\n## Overview" }, "name": "section-overview" }, @@ -49,9 +51,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n TotalDocuments = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", "size": 4, "title": "Overall Token Usage Summary", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "tiles", @@ -65,9 +71,13 @@ "type": 3, "content": { "version": 
"KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp {TimeRange}\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Step = agent\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Pipeline Step", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "barchart" @@ -78,9 +88,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\n| order by timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = 
toint(customDimensions['total_output_tokens'])\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "areachart" @@ -91,9 +105,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp {TimeRange}\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize TotalTokens = sum(total_tokens) by agent", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize TotalTokens = sum(total_tokens) by agent", "size": 0, "title": "Token Distribution by Agent", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "piechart" @@ -103,7 +121,7 @@ { "type": 1, "content": { - "json": "## Cost Estimation" + "json": "---\n## Cost Estimation" }, "name": "section-cost" }, @@ -111,9 +129,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\nlet output_price_per_million = 10.00;\ncustomEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\n| extend OutputCost = 
round(TotalOutput * output_price_per_million / 1000000.0, 4)\n| extend TotalCost = InputCost + OutputCost\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\n| order by Day desc", + "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", + "timeContext": { + "durationMs": 2592000000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" @@ -124,9 +146,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let gpt4o_input = 2.50;\nlet gpt4o_output = 10.00;\nlet gpt4o_mini_input = 0.15;\nlet gpt4o_mini_output = 0.60;\ncustomEvents\n| where name == 'LLM_Model_Token_Usage'\n| where timestamp {TimeRange}\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\n| extend InputCost = round(TotalInput * 
InputPrice / 1000000.0, 4)\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\n| extend TotalCost = InputCost + OutputCost\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\n| order by TotalCost desc", + "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", "size": 0, "title": "Estimated Cost by Model", + "timeContext": { + "durationMs": 2592000000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" @@ -136,7 +162,7 @@ { "type": 1, "content": { - "json": "## Model & Document Details" + "json": "---\n## Model & Document Details" }, "name": "section-details" }, @@ -144,9 +170,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Model_Token_Usage'\n| where timestamp {TimeRange}\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend 
total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Model = model\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Model", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" @@ -157,9 +187,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp {TimeRange}\n| extend agent = tostring(customDimensions['agent_name'])\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = count()\n by Step = agent, Model = model\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = 
toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Step-to-Model Token Mapping", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" @@ -170,9 +204,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend process_id = tostring(customDimensions['process_id'])\n| extend file_name = tostring(customDimensions['file_name'])\n| summarize TotalTokens = sum(total_tokens) by process_id, file_name\n| order by TotalTokens desc\n| take 20", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = sum(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" @@ -183,9 +221,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend input_tokens = 
toint(customDimensions['total_input_tokens'])\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\n| extend mime_type = tostring(customDimensions['file_mime_type'])\n| extend file_type = case(\n mime_type has \"pdf\", \"PDF\",\n mime_type has \"image\", \"Image\",\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\n mime_type has \"text\", \"Text\",\n \"Other\")\n| summarize\n Documents = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0)\n by FileType = file_type\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "barchart" @@ -195,7 +237,7 @@ { "type": 1, "content": { - "json": "## Percentiles & Trends" + "json": "---\n## Percentiles & Trends" }, 
"name": "section-percentiles" }, @@ -203,9 +245,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize\n p50 = percentile(total_tokens, 50),\n p90 = percentile(total_tokens, 90),\n p95 = percentile(total_tokens, 95),\n p99 = percentile(total_tokens, 99),\n Max = max(total_tokens)", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", "size": 4, "title": "Token Usage Percentiles Per Document", + "timeContext": { + "durationMs": 604800000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "tiles", @@ -219,9 +265,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Token_Usage_Summary'\n| where timestamp {TimeRange}\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| extend file_name = tostring(customDimensions['file_name'])\n| summarize\n DocumentsProcessed = count(),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0),\n MaxTokensPerDoc = max(total_tokens)\n by Day = bin(timestamp, 1d)\n| order by Day desc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", 
"size": 0, "title": "Daily Processing Volume with Token Usage", + "timeContext": { + "durationMs": 2592000000 + }, + "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" @@ -229,8 +279,8 @@ "name": "daily-volume" } ], - "isLocked": false, "fallbackResourceIds": [ "Azure Monitor" - ] + ], + "fromTemplateId": "community-Workbooks/Common/Templates" } From bd3f1e0f578231af1da1c3785d3f1f3e197f546f Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Thu, 14 May 2026 13:27:32 +0530 Subject: [PATCH 05/23] fix: Replace tiles visualization with table to fix workbook import error Changed 'tiles' visualization to 'table' for summary and percentiles items to resolve 'undefined (Unknown)' error in workbook Advanced Editor. Kept fallbackResourceIds as 'Azure Monitor' for portability. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 50 +++++++++++++--------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index bfab3f9e..0776f870 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -51,8 +51,8 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", - "size": 4, + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend 
input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "size": 3, "title": "Overall Token Usage Summary", "timeContext": { "durationMs": 604800000 @@ -60,9 +60,14 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "tiles", - "tileSettings": { - "showBorder": true + "visualization": "table", + "gridSettings": { + "formatters": [ + { + "columnMatch": "TotalTokens", + "formatter": 1 + } + ] } }, "name": "overall-summary" @@ -71,7 +76,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = 
count()\r\n by Step = agent\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Pipeline Step", "timeContext": { @@ -88,7 +93,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", "timeContext": { @@ -105,7 +110,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize TotalTokens = sum(total_tokens) by agent", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize TotalTokens = sum(total_tokens) by agent", "size": 0, "title": "Token Distribution by Agent", "timeContext": { @@ -129,7 +134,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens 
= toint(customDimensions['total_output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", + "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", "timeContext": { @@ -146,7 +151,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| 
extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", + "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", "size": 0, "title": "Estimated Cost by Model", "timeContext": { @@ -170,7 +175,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = 
count()\r\n by Model = model\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Model", "timeContext": { @@ -187,7 +192,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by 
Step = agent, Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Step-to-Model Token Mapping", "timeContext": { @@ -204,7 +209,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = sum(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = sum(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", "timeContext": { @@ -221,7 +226,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 
0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", "timeContext": { @@ -245,8 +250,8 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", - "size": 4, + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", + "size": 3, "title": "Token Usage Percentiles Per Document", "timeContext": { "durationMs": 604800000 @@ -254,9 +259,14 @@ 
"timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "tiles", - "tileSettings": { - "showBorder": true + "visualization": "table", + "gridSettings": { + "formatters": [ + { + "columnMatch": "p50|p90|p95|p99|Max", + "formatter": 1 + } + ] } }, "name": "token-percentiles" @@ -265,7 +275,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", "size": 0, "title": "Daily Processing Volume with Token Usage", "timeContext": { From 385218457921fafeee01269c838d19a7ffd06111 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Thu, 14 May 2026 14:04:09 +0530 Subject: [PATCH 06/23] revert: Remove enableMonitoring from main.parameters.json This was a deployment-specific change that should not be in the feature branch. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/main.parameters.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/infra/main.parameters.json b/infra/main.parameters.json index d20510b0..44153d57 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -29,9 +29,6 @@ "existingFoundryProjectResourceId": { "value": "${AZURE_EXISTING_AIPROJECT_RESOURCE_ID}" }, - "enableMonitoring": { - "value": true - }, "containerRegistryEndpoint": { "value": "${AZURE_ENV_CONTAINER_REGISTRY_ENDPOINT}" }, From 55eadb387df7e4412923af1ef83bcb87b6ebfa4b Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Thu, 14 May 2026 17:11:29 +0530 Subject: [PATCH 07/23] fix: Deduplicate KQL queries in workbook to prevent double-counting events Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 0776f870..8192f908 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -51,7 +51,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = 
toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", "size": 3, "title": "Overall Token Usage Summary", "timeContext": { @@ -76,7 +76,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| order by 
TotalTokens desc", "size": 0, "title": "Token Usage by Pipeline Step", "timeContext": { @@ -93,7 +93,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", "timeContext": { @@ -110,7 +110,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize TotalTokens = sum(total_tokens) by agent", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", "size": 0, "title": "Token Distribution by 
Agent", "timeContext": { @@ -134,7 +134,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", + "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", "timeContext": { @@ -209,7 +209,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": 
"customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = sum(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = max(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", "timeContext": { @@ -226,7 +226,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where 
timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens), file_type=take_any(file_type) by process_id\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", "timeContext": { @@ -250,7 +250,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = 
percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", "size": 3, "title": "Token Usage Percentiles Per Document", "timeContext": { @@ -275,7 +275,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", + "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", "size": 0, "title": "Daily Processing Volume with Token Usage", "timeContext": { From 6407673fc81c1e5483b7ecc28d6f69a45d1cf7a0 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 16:05:59 +0530 Subject: [PATCH 08/23] feat: Update workbook with grid layout and add deploy-workbook.bicep - Updated token-usage-workbook.json with grid layout for graphs - Added deploy-workbook.bicep for standalone workbook deployment to any RG Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/deploy-workbook.bicep | 27 +++++++++ infra/dashboards/token-usage-workbook.json | 66 ++++++++-------------- 2 files changed, 49 insertions(+), 44 deletions(-) create mode 100644 
infra/dashboards/deploy-workbook.bicep diff --git a/infra/dashboards/deploy-workbook.bicep b/infra/dashboards/deploy-workbook.bicep new file mode 100644 index 00000000..9d030a56 --- /dev/null +++ b/infra/dashboards/deploy-workbook.bicep @@ -0,0 +1,27 @@ +// Standalone deployment for LLM Token Usage Workbook +// Connects to an existing Application Insights instance from any content processing RG + +targetScope = 'resourceGroup' + +@description('Full resource ID of the Application Insights instance to query.') +param appInsightsResourceId string + +@description('Azure region for the workbook resource.') +param location string = resourceGroup().location + +var workbookId = guid(resourceGroup().id, 'token-usage-workbook') + +resource workbook 'Microsoft.Insights/workbooks@2022-04-01' = { + name: workbookId + location: location + kind: 'shared' + properties: { + displayName: 'LLM Token Usage Dashboard' + category: 'workbook' + sourceId: appInsightsResourceId + serializedData: loadTextContent('token-usage-workbook.json') + } +} + +output workbookName string = workbook.name +output workbookId string = workbook.id diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 8192f908..3be53340 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -25,10 +25,18 @@ }, "typeSettings": { "selectableValues": [ - {"durationMs": 3600000}, - {"durationMs": 86400000}, - {"durationMs": 604800000}, - {"durationMs": 2592000000} + { + "durationMs": 3600000 + }, + { + "durationMs": 86400000 + }, + { + "durationMs": 604800000 + }, + { + "durationMs": 2592000000 + } ], "allowCustom": true } @@ -54,9 +62,6 @@ "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = 
toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", "size": 3, "title": "Overall Token Usage Summary", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -76,16 +81,13 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), 
output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| project Step, InputTokens, OutputTokens, TotalTokens, Invocations\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Pipeline Step", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "barchart" + "visualization": "table" }, "name": "tokens-by-agent" }, @@ -96,9 +98,6 @@ "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -113,9 +112,6 @@ "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", "size": 0, "title": "Token Distribution by Agent", - "timeContext": { - "durationMs": 604800000 - }, 
"timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -137,9 +133,6 @@ "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", - "timeContext": { - "durationMs": 2592000000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -154,9 +147,6 @@ "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", 
gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", "size": 0, "title": "Estimated Cost by Model", - "timeContext": { - "durationMs": 2592000000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -178,9 +168,6 @@ "query": "customEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Model", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -195,9 +182,6 @@ "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent, Model = model\r\n| order by 
TotalTokens desc", "size": 0, "title": "Step-to-Model Token Mapping", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -212,9 +196,6 @@ "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = max(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -229,9 +210,6 @@ "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens), file_type=take_any(file_type) by process_id\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 
0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -253,9 +231,6 @@ "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", "size": 3, "title": "Token Usage Percentiles Per Document", - "timeContext": { - "durationMs": 604800000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -278,9 +253,6 @@ "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", "size": 0, "title": "Daily Processing Volume with Token Usage", - "timeContext": { - "durationMs": 2592000000 - }, "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -289,8 +261,14 @@ "name": "daily-volume" } ], + "isLocked": true, + "defaultResourceIds": [ + 
"/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2", + "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourceGroups/rg-pgcpfeat/providers/Microsoft.Insights/components/appi-pgcpfeatuw333" + ], "fallbackResourceIds": [ - "Azure Monitor" + "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2", + "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourceGroups/rg-pgcpfeat/providers/Microsoft.Insights/components/appi-pgcpfeatuw333" ], "fromTemplateId": "community-Workbooks/Common/Templates" -} +} \ No newline at end of file From cdde8a397f53e8d714584908a52f7363721f5c7f Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 16:12:59 +0530 Subject: [PATCH 09/23] feat: Change Token Usage by File Type to grid visualization Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 3be53340..d83c00d8 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -213,7 +213,7 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "barchart" + "visualization": "table" }, "name": "tokens-by-filetype" }, From 4e020ad4c53c3fc0f92b062bdf0b5d588bce340d Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 17:48:49 +0530 Subject: [PATCH 10/23] fix: derive overall token totals from per-agent events instead of broken summary event The LLM_Token_Usage_Summary event only captured MapHandler tokens because save_handler doesn't have visibility into Workflow service steps (Summarize, RAI, GapAnalysis). 
Fixed all queries that need total token counts to aggregate from LLM_Agent_Token_Usage events grouped by process_id. Queries fixed: - Overall Token Usage Summary - Token Usage Over Time - Estimated Daily Cost - Top 20 Token Consumers (join with Summary for file_name) - Token Usage by File Type (join with Summary for mime_type) - Token Usage Percentiles - Daily Processing Volume Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-queries.kql | 66 +++++++++++++------ infra/dashboards/token-usage-workbook.json | 14 ++-- .../libs/pipeline/handlers/save_handler.py | 3 + 3 files changed, 56 insertions(+), 27 deletions(-) diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql index a08cd640..a74e3ab8 100644 --- a/infra/dashboards/token-usage-queries.kql +++ b/infra/dashboards/token-usage-queries.kql @@ -6,11 +6,13 @@ // 1. Overall token usage summary (last 7 days) customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) -| extend input_tokens = toint(customDimensions['total_input_tokens']) -| extend output_tokens = toint(customDimensions['total_output_tokens']) +| extend process_id = tostring(customDimensions['process_id']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) | extend total_tokens = toint(customDimensions['total_tokens']) +| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id | summarize TotalDocuments = count(), TotalInputTokens = sum(input_tokens), @@ -36,10 +38,12 @@ customEvents // 3. 
Token usage over time (hourly) customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) -| extend input_tokens = toint(customDimensions['total_input_tokens']) -| extend output_tokens = toint(customDimensions['total_output_tokens']) +| extend process_id = tostring(customDimensions['process_id']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id | summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h) | order by timestamp asc | render areachart @@ -48,10 +52,12 @@ customEvents let input_price_per_million = 2.50; let output_price_per_million = 10.00; customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(30d) -| extend input_tokens = toint(customDimensions['total_input_tokens']) -| extend output_tokens = toint(customDimensions['total_output_tokens']) +| extend process_id = tostring(customDimensions['process_id']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) +| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id | summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d) | extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4) | extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4) @@ -61,12 +67,20 @@ customEvents // 5. 
Top token consumers by document customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) -| extend total_tokens = toint(customDimensions['total_tokens']) | extend process_id = tostring(customDimensions['process_id']) -| extend file_name = tostring(customDimensions['file_name']) -| summarize TotalTokens = sum(total_tokens) by process_id, file_name +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize TotalTokens = sum(total_tokens) by process_id +| join kind=leftouter ( + customEvents + | where name == 'LLM_Token_Usage_Summary' + | where timestamp > ago(7d) + | extend process_id = tostring(customDimensions['process_id']) + | extend file_name = tostring(customDimensions['file_name']) + | summarize file_name=take_any(file_name) by process_id +) on process_id +| project process_id, file_name, TotalTokens | order by TotalTokens desc | take 20 @@ -81,9 +95,11 @@ customEvents // 7. Token usage percentiles per document customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) +| extend process_id = tostring(customDimensions['process_id']) | extend total_tokens = toint(customDimensions['total_tokens']) +| summarize total_tokens=sum(total_tokens) by process_id | summarize p50 = percentile(total_tokens, 50), p90 = percentile(total_tokens, 90), @@ -230,12 +246,21 @@ dependencies // 16. Token usage by file type (PDF, DOCX, image, etc.) 
customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) +| extend process_id = tostring(customDimensions['process_id']) +| extend input_tokens = toint(customDimensions['input_tokens']) +| extend output_tokens = toint(customDimensions['output_tokens']) | extend total_tokens = toint(customDimensions['total_tokens']) -| extend input_tokens = toint(customDimensions['total_input_tokens']) -| extend output_tokens = toint(customDimensions['total_output_tokens']) -| extend mime_type = tostring(customDimensions['file_mime_type']) +| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id +| join kind=leftouter ( + customEvents + | where name == 'LLM_Token_Usage_Summary' + | where timestamp > ago(7d) + | extend process_id = tostring(customDimensions['process_id']) + | extend mime_type = tostring(customDimensions['file_mime_type']) + | summarize mime_type=take_any(mime_type) by process_id +) on process_id | extend file_type = case( mime_type has "pdf", "PDF", mime_type has "image", "Image", @@ -270,10 +295,11 @@ customEvents // 18. 
Daily processing volume with token costs customEvents -| where name == 'LLM_Token_Usage_Summary' +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(30d) +| extend process_id = tostring(customDimensions['process_id']) | extend total_tokens = toint(customDimensions['total_tokens']) -| extend file_name = tostring(customDimensions['file_name']) +| summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id | summarize DocumentsProcessed = count(), TotalTokens = sum(total_tokens), diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index d83c00d8..8562f0dd 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -59,7 +59,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), 
total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", "size": 3, "title": "Overall Token Usage Summary", "timeContextFromParameter": "TimeRange", @@ -95,7 +95,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", "timeContextFromParameter": "TimeRange", @@ -130,7 +130,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = 
toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", + "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", "timeContextFromParameter": "TimeRange", @@ -193,7 +193,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend 
process_id = tostring(customDimensions['process_id'])\r\n| extend file_name = tostring(customDimensions['file_name'])\r\n| summarize TotalTokens = max(total_tokens) by process_id, file_name\r\n| order by TotalTokens desc\r\n| take 20", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize TotalTokens = sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend file_name = tostring(customDimensions['file_name'])\r\n | summarize file_name=take_any(file_name) by process_id\r\n) on process_id\r\n| project process_id, file_name, TotalTokens\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", "timeContextFromParameter": "TimeRange", @@ -207,7 +207,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| extend input_tokens = toint(customDimensions['total_input_tokens'])\r\n| extend output_tokens = toint(customDimensions['total_output_tokens'])\r\n| extend mime_type = tostring(customDimensions['file_mime_type'])\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens), 
file_type=take_any(file_type) by process_id\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend mime_type = tostring(customDimensions['file_mime_type'])\r\n | summarize mime_type=take_any(mime_type) by process_id\r\n) on process_id\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", "timeContextFromParameter": "TimeRange", @@ -228,7 +228,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = 
tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", "size": 3, "title": "Token Usage Percentiles Per Document", "timeContextFromParameter": "TimeRange", @@ -250,7 +250,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Token_Usage_Summary'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = 
sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", "size": 0, "title": "Daily Processing Volume with Token Usage", "timeContextFromParameter": "TimeRange", diff --git a/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py b/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py index 33c15a66..ed624069 100644 --- a/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py +++ b/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py @@ -170,6 +170,9 @@ def find_process_result(step_name: str): ) # Emit token usage summary and per-model events to Application Insights + # NOTE: This summary only contains tokens from the evaluate/map step. + # For true totals across all pipeline steps (Summarize, RAI, GapAnalysis), + # aggregate from LLM_Agent_Token_Usage events grouped by process_id. emit_summary_token_event( total_input_tokens=evaluated_result.prompt_tokens, total_output_tokens=evaluated_result.completion_tokens, From c91335253b05816c311625aea5e197b3d52412b3 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 17:50:40 +0530 Subject: [PATCH 11/23] feat: add processing time queries per pipeline step and per document Added 3 new queries/panels: - Processing Time by Pipeline Step (avg, p50, p90, max per step) - Total Document Processing Time (end-to-end workflow summary) - Per-Document Processing Time Breakdown (individual step timing) Queries parse elapsed time from traces logged by queue_handler_base and claim_processor workflow. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-queries.kql | 61 +++++++++++++++++++++- infra/dashboards/token-usage-workbook.json | 49 +++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql index a74e3ab8..c94d06b4 100644 --- a/infra/dashboards/token-usage-queries.kql +++ b/infra/dashboards/token-usage-queries.kql @@ -293,7 +293,66 @@ customEvents by process_id, Step = agent | order by process_id, TotalTokens desc -// 18. Daily processing volume with token costs +// 18. Processing time per pipeline step +traces +| where timestamp > ago(7d) +| where message has "Pipeline stage completed" +| parse message with * "process_id=" process_id ", document=" document ", stage=" stage ", elapsed=" elapsed +| where isnotempty(stage) +| extend elapsed_parts = split(elapsed, ":") +| extend hours = toint(elapsed_parts[0]) +| extend mins = toint(elapsed_parts[1]) +| extend sec_parts = split(tostring(elapsed_parts[2]), ".") +| extend secs = toint(sec_parts[0]) +| extend ms = toint(sec_parts[1]) +| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0 +| summarize + AvgSeconds = round(avg(elapsed_seconds), 2), + P50Seconds = round(percentile(elapsed_seconds, 50), 2), + P90Seconds = round(percentile(elapsed_seconds, 90), 2), + MaxSeconds = round(max(elapsed_seconds), 2), + Invocations = count() + by Step = stage +| order by AvgSeconds desc + +// 19. 
Total document processing time (end-to-end workflow) +traces +| where timestamp > ago(7d) +| where message has "Workflow elapsed time" +| parse message with * "Workflow elapsed time: " elapsed " (start=" * ", end=" * +| where isnotempty(elapsed) +| extend elapsed_parts = split(elapsed, ":") +| extend hours = toint(elapsed_parts[0]) +| extend mins = toint(elapsed_parts[1]) +| extend sec_parts = split(tostring(elapsed_parts[2]), ".") +| extend secs = toint(sec_parts[0]) +| extend ms = toint(sec_parts[1]) +| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0 +| summarize + DocumentsProcessed = count(), + AvgSeconds = round(avg(elapsed_seconds), 2), + P50Seconds = round(percentile(elapsed_seconds, 50), 2), + P90Seconds = round(percentile(elapsed_seconds, 90), 2), + P95Seconds = round(percentile(elapsed_seconds, 95), 2), + MaxSeconds = round(max(elapsed_seconds), 2) + +// 20. Per-document processing time breakdown by step +traces +| where timestamp > ago(7d) +| where message has "Pipeline stage completed" +| parse message with * "process_id=" process_id ", document=" document ", stage=" stage ", elapsed=" elapsed +| where isnotempty(stage) +| extend elapsed_parts = split(elapsed, ":") +| extend hours = toint(elapsed_parts[0]) +| extend mins = toint(elapsed_parts[1]) +| extend sec_parts = split(tostring(elapsed_parts[2]), ".") +| extend secs = toint(sec_parts[0]) +| extend ms = toint(sec_parts[1]) +| extend elapsed_seconds = round(hours * 3600.0 + mins * 60.0 + secs + ms / 100.0, 2) +| project timestamp, process_id, document, Step=stage, ElapsedSeconds=elapsed_seconds +| order by process_id, timestamp asc + +// 21. 
Daily processing volume with token costs customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(30d) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 8562f0dd..ea44344c 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -217,6 +217,55 @@ }, "name": "tokens-by-filetype" }, + { + "type": 1, + "content": { + "json": "---\n## Processing Time" + }, + "name": "section-processing-time" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Pipeline stage completed\"\r\n| parse message with * \"process_id=\" process_id \", document=\" document \", stage=\" stage \", elapsed=\" elapsed\r\n| where isnotempty(stage)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0\r\n| summarize\r\n AvgSeconds = round(avg(elapsed_seconds), 2),\r\n P50Seconds = round(percentile(elapsed_seconds, 50), 2),\r\n P90Seconds = round(percentile(elapsed_seconds, 90), 2),\r\n MaxSeconds = round(max(elapsed_seconds), 2),\r\n Invocations = count()\r\n by Step = stage\r\n| order by AvgSeconds desc", + "size": 0, + "title": "Processing Time by Pipeline Step", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "processing-time-by-step" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Workflow elapsed time\"\r\n| parse message with * \"Workflow elapsed time: \" elapsed \" (start=\" * \", end=\" 
*\r\n| where isnotempty(elapsed)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(elapsed_seconds), 2),\r\n P50Seconds = round(percentile(elapsed_seconds, 50), 2),\r\n P90Seconds = round(percentile(elapsed_seconds, 90), 2),\r\n P95Seconds = round(percentile(elapsed_seconds, 95), 2),\r\n MaxSeconds = round(max(elapsed_seconds), 2)", + "size": 3, + "title": "Total Document Processing Time (End-to-End)", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "total-processing-time" + }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Pipeline stage completed\"\r\n| parse message with * \"process_id=\" process_id \", document=\" document \", stage=\" stage \", elapsed=\" elapsed\r\n| where isnotempty(stage)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = round(hours * 3600.0 + mins * 60.0 + secs + ms / 100.0, 2)\r\n| project timestamp, process_id, document, Step=stage, ElapsedSeconds=elapsed_seconds\r\n| order by process_id, timestamp asc", + "size": 0, + "title": "Per-Document Processing Time Breakdown", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + 
"name": "per-document-time-breakdown" + }, { "type": 1, "content": { From a6dcfbece156f133882d5e30e542bc24f10fbfa8 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 19:34:26 +0530 Subject: [PATCH 12/23] fix: rewrite processing time queries to use customEvents timestamps The trace-based queries weren't working because Python Azure Monitor SDK stores log templates and args separately. New approach: - Step Completion Time: uses LLM_Agent_Token_Usage event timestamps to compute time from doc start to each step's completion - OpenAI API Call Durations: uses dependencies table for HTTP call times - Per-Document Step Timeline: shows when each step completed per document - Total Processing Time: first-to-last step duration per document All queries now use customEvents/dependencies tables which reliably capture data from both ContentProcessor and ContentProcessorWorkflow. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-queries.kql | 109 ++++++++++++--------- infra/dashboards/token-usage-workbook.json | 30 ++++-- 2 files changed, 82 insertions(+), 57 deletions(-) diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql index c94d06b4..080b6d60 100644 --- a/infra/dashboards/token-usage-queries.kql +++ b/infra/dashboards/token-usage-queries.kql @@ -293,65 +293,76 @@ customEvents by process_id, Step = agent | order by process_id, TotalTokens desc -// 18. Processing time per pipeline step -traces +// 18. 
Step completion time (seconds from document start to step completion) +customEvents +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) -| where message has "Pipeline stage completed" -| parse message with * "process_id=" process_id ", document=" document ", stage=" stage ", elapsed=" elapsed -| where isnotempty(stage) -| extend elapsed_parts = split(elapsed, ":") -| extend hours = toint(elapsed_parts[0]) -| extend mins = toint(elapsed_parts[1]) -| extend sec_parts = split(tostring(elapsed_parts[2]), ".") -| extend secs = toint(sec_parts[0]) -| extend ms = toint(sec_parts[1]) -| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0 +| extend agent = tostring(customDimensions['agent_name']) +| extend process_id = tostring(customDimensions['process_id']) +| join kind=inner ( + customEvents + | where name == 'LLM_Agent_Token_Usage' + | where timestamp > ago(7d) + | extend process_id = tostring(customDimensions['process_id']) + | summarize DocStartTime = min(timestamp) by process_id +) on process_id +| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2) | summarize - AvgSeconds = round(avg(elapsed_seconds), 2), - P50Seconds = round(percentile(elapsed_seconds, 50), 2), - P90Seconds = round(percentile(elapsed_seconds, 90), 2), - MaxSeconds = round(max(elapsed_seconds), 2), + AvgCompletionTime = round(avg(StepDurationSeconds), 2), + P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2), + P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2), + MaxCompletionTime = round(max(StepDurationSeconds), 2), Invocations = count() - by Step = stage -| order by AvgSeconds desc + by Step = agent +| order by AvgCompletionTime desc -// 19. Total document processing time (end-to-end workflow) -traces +// 19. 
OpenAI API call durations from dependencies table +dependencies | where timestamp > ago(7d) -| where message has "Workflow elapsed time" -| parse message with * "Workflow elapsed time: " elapsed " (start=" * ", end=" * -| where isnotempty(elapsed) -| extend elapsed_parts = split(elapsed, ":") -| extend hours = toint(elapsed_parts[0]) -| extend mins = toint(elapsed_parts[1]) -| extend sec_parts = split(tostring(elapsed_parts[2]), ".") -| extend secs = toint(sec_parts[0]) -| extend ms = toint(sec_parts[1]) -| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0 +| where target has "openai" or name has "chat" or type == "HTTP" or name has "openai" +| where success == true +| extend durationSeconds = round(duration / 1000.0, 2) | summarize - DocumentsProcessed = count(), - AvgSeconds = round(avg(elapsed_seconds), 2), - P50Seconds = round(percentile(elapsed_seconds, 50), 2), - P90Seconds = round(percentile(elapsed_seconds, 90), 2), - P95Seconds = round(percentile(elapsed_seconds, 95), 2), - MaxSeconds = round(max(elapsed_seconds), 2) + TotalCalls = count(), + AvgSeconds = round(avg(durationSeconds), 2), + P50Seconds = round(percentile(durationSeconds, 50), 2), + P90Seconds = round(percentile(durationSeconds, 90), 2), + MaxSeconds = round(max(durationSeconds), 2) + by OperationName = name +| order by TotalCalls desc +| take 10 -// 20. Per-document processing time breakdown by step -traces +// 20. 
Per-document step timeline +customEvents +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) -| where message has "Pipeline stage completed" -| parse message with * "process_id=" process_id ", document=" document ", stage=" stage ", elapsed=" elapsed -| where isnotempty(stage) -| extend elapsed_parts = split(elapsed, ":") -| extend hours = toint(elapsed_parts[0]) -| extend mins = toint(elapsed_parts[1]) -| extend sec_parts = split(tostring(elapsed_parts[2]), ".") -| extend secs = toint(sec_parts[0]) -| extend ms = toint(sec_parts[1]) -| extend elapsed_seconds = round(hours * 3600.0 + mins * 60.0 + secs + ms / 100.0, 2) -| project timestamp, process_id, document, Step=stage, ElapsedSeconds=elapsed_seconds +| extend agent = tostring(customDimensions['agent_name']) +| extend process_id = tostring(customDimensions['process_id']) +| join kind=inner ( + customEvents + | where name == 'LLM_Agent_Token_Usage' + | where timestamp > ago(7d) + | extend process_id = tostring(customDimensions['process_id']) + | summarize DocStartTime = min(timestamp) by process_id +) on process_id +| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2) +| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt | order by process_id, timestamp asc +// 21. Total document processing time (first to last step) +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend process_id = tostring(customDimensions['process_id']) +| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id +| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2) +| summarize + DocumentsProcessed = count(), + AvgSeconds = round(avg(TotalSeconds), 2), + P50Seconds = round(percentile(TotalSeconds, 50), 2), + P90Seconds = round(percentile(TotalSeconds, 90), 2), + MaxSeconds = round(max(TotalSeconds), 2) + // 21. 
Daily processing volume with token costs customEvents | where name == 'LLM_Agent_Token_Usage' diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index ea44344c..8cec143c 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -228,9 +228,9 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Pipeline stage completed\"\r\n| parse message with * \"process_id=\" process_id \", document=\" document \", stage=\" stage \", elapsed=\" elapsed\r\n| where isnotempty(stage)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0\r\n| summarize\r\n AvgSeconds = round(avg(elapsed_seconds), 2),\r\n P50Seconds = round(percentile(elapsed_seconds, 50), 2),\r\n P90Seconds = round(percentile(elapsed_seconds, 90), 2),\r\n MaxSeconds = round(max(elapsed_seconds), 2),\r\n Invocations = count()\r\n by Step = stage\r\n| order by AvgSeconds desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n 
P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc", "size": 0, - "title": "Processing Time by Pipeline Step", + "title": "Step Completion Time (seconds from doc start)", "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -242,23 +242,23 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Workflow elapsed time\"\r\n| parse message with * \"Workflow elapsed time: \" elapsed \" (start=\" * \", end=\" *\r\n| where isnotempty(elapsed)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(elapsed_seconds), 2),\r\n P50Seconds = round(percentile(elapsed_seconds, 50), 2),\r\n P90Seconds = round(percentile(elapsed_seconds, 90), 2),\r\n P95Seconds = round(percentile(elapsed_seconds, 95), 2),\r\n MaxSeconds = round(max(elapsed_seconds), 2)", - "size": 3, - "title": "Total Document Processing Time (End-to-End)", + "query": "dependencies\r\n| where timestamp > ago(7d)\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls = count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 
2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10", + "size": 0, + "title": "OpenAI API Call Durations", "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table" }, - "name": "total-processing-time" + "name": "openai-call-durations" }, { "type": 3, "content": { "version": "KqlItem/1.0", - "query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Pipeline stage completed\"\r\n| parse message with * \"process_id=\" process_id \", document=\" document \", stage=\" stage \", elapsed=\" elapsed\r\n| where isnotempty(stage)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = round(hours * 3600.0 + mins * 60.0 + secs + ms / 100.0, 2)\r\n| project timestamp, process_id, document, Step=stage, ElapsedSeconds=elapsed_seconds\r\n| order by process_id, timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc", "size": 0, - "title": "Per-Document Processing Time Breakdown", + "title": "Per-Document Step 
Timeline", "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", @@ -266,6 +266,20 @@ }, "name": "per-document-time-breakdown" }, + { + "type": 3, + "content": { + "version": "KqlItem/1.0", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id\r\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(TotalSeconds), 2),\r\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\r\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\r\n MaxSeconds = round(max(TotalSeconds), 2)", + "size": 3, + "title": "Total Document Processing Time (First to Last Step)", + "timeContextFromParameter": "TimeRange", + "queryType": 0, + "resourceType": "microsoft.insights/components", + "visualization": "table" + }, + "name": "total-processing-time" + }, { "type": 1, "content": { From 1e81e75ab29726b29cc3af486100bf44b6024da1 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 21:33:52 +0530 Subject: [PATCH 13/23] chore: remove appi-pgcpfeatuw333 from workbook, keep only appi-cptoken1kbxf2 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 8cec143c..122e18b1 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -326,12 +326,10 @@ ], "isLocked": true, "defaultResourceIds": [ - "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2", - 
"/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourceGroups/rg-pgcpfeat/providers/Microsoft.Insights/components/appi-pgcpfeatuw333" + "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2" ], "fallbackResourceIds": [ - "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2", - "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourceGroups/rg-pgcpfeat/providers/Microsoft.Insights/components/appi-pgcpfeatuw333" + "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2" ], "fromTemplateId": "community-Workbooks/Common/Templates" } \ No newline at end of file From 5c9be247b53057291fa381815ad2f9d7b3bf0f82 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Fri, 15 May 2026 22:54:28 +0530 Subject: [PATCH 14/23] Fix token usage workbook: dedup events, time range params, portable resource IDs - All token queries now use max() by (agent, process_id) dedup pattern to handle duplicate custom events from ContentProcessor/Workflow - Replaced hardcoded ago(7d) with TimeRange workbook parameter so the time picker actually filters data correctly - Removed hardcoded App Insights resource IDs from workbook JSON making it portable for redeployment to any RG via deploy-workbook.bicep - Updated token-usage-queries.kql to match all workbook queries with dedup pattern and reorganized (16 queries total) - Fixed registries: null in main.bicep/main.json for container apps Queries fixed with dedup: Overall Summary, Pipeline Step, Token Over Time, Daily Cost, Step-to-Model, Top 20 Consumers, Token by File Type, Percentiles, Daily Volume, Pie Chart Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-queries.kql | 245 
++++++++------------- infra/dashboards/token-usage-workbook.json | 40 ++-- infra/main.bicep | 49 ++++- infra/main.json | 103 +++++---- 4 files changed, 216 insertions(+), 221 deletions(-) diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql index 080b6d60..c5070943 100644 --- a/infra/dashboards/token-usage-queries.kql +++ b/infra/dashboards/token-usage-queries.kql @@ -2,16 +2,25 @@ // KQL Queries for LLM Token Usage Monitoring // Content Processing Solution Accelerator // Run these in Application Insights > Logs +// +// IMPORTANT: All queries use a deduplication pattern: +// max() by (agent, process_id) first, then sum() +// This handles duplicate custom events that can occur when +// both ContentProcessor and ContentProcessorWorkflow emit +// telemetry through the same Application Insights instance. // ============================================================ -// 1. Overall token usage summary (last 7 days) +// 1. Overall token usage summary +// Dedup: max by (agent, process_id) → sum by process_id → aggregate customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) | extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) | extend total_tokens = toint(customDimensions['total_tokens']) +| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id | summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id | summarize TotalDocuments = count(), @@ -21,19 +30,23 @@ customEvents AvgTokensPerDocument = round(avg(total_tokens), 0) // 2. 
Token usage by pipeline step (agent) +// Dedup: max by (agent, process_id) → sum by agent customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) | extend agent = tostring(customDimensions['agent_name']) +| extend process_id = tostring(customDimensions['process_id']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) | extend total_tokens = toint(customDimensions['total_tokens']) +| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id | summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens), TotalTokens = sum(total_tokens), Invocations = count() by Step = agent +| project Step, InputTokens, OutputTokens, TotalTokens, Invocations | order by TotalTokens desc // 3. Token usage over time (hourly) @@ -41,22 +54,37 @@ customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) | extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) +| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id | summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id | summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h) | order by timestamp asc | render areachart -// 4. Estimated cost (GPT-4o pricing: $2.50/1M input, $10.00/1M output) +// 4. 
Token distribution by agent (pie chart) +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend agent = tostring(customDimensions['agent_name']) +| extend process_id = tostring(customDimensions['process_id']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize total_tokens=max(total_tokens) by agent, process_id +| summarize TotalTokens = sum(total_tokens) by agent +| render piechart + +// 5. Estimated daily cost (GPT-4o pricing: $2.50/1M input, $10.00/1M output) let input_price_per_million = 2.50; let output_price_per_million = 10.00; customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(30d) | extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) +| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id | summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id | summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d) | extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4) @@ -65,109 +93,7 @@ customEvents | project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost | order by Day desc -// 5. 
Top token consumers by document -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize TotalTokens = sum(total_tokens) by process_id -| join kind=leftouter ( - customEvents - | where name == 'LLM_Token_Usage_Summary' - | where timestamp > ago(7d) - | extend process_id = tostring(customDimensions['process_id']) - | extend file_name = tostring(customDimensions['file_name']) - | summarize file_name=take_any(file_name) by process_id -) on process_id -| project process_id, file_name, TotalTokens -| order by TotalTokens desc -| take 20 - -// 6. Pipeline step token distribution (pie chart) -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize TotalTokens = sum(total_tokens) by agent -| render piechart - -// 7. Token usage percentiles per document -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize total_tokens=sum(total_tokens) by process_id -| summarize - p50 = percentile(total_tokens, 50), - p90 = percentile(total_tokens, 90), - p95 = percentile(total_tokens, 95), - p99 = percentile(total_tokens, 99), - Max = max(total_tokens) - -// 8. 
Token usage by step grouping (Extraction vs Analysis vs Safety) -let StepGroupMapping = datatable(agent:string, StepGroup:string) [ - "MapHandler", "Extraction", - "RAI", "Safety", - "Summarize", "Analysis", - "GapAnalysis", "Analysis" -]; -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| lookup kind=leftouter StepGroupMapping on agent -| extend StepGroup = iff(isempty(StepGroup), "Unknown", StepGroup) -| summarize - TotalRequests = count(), - TotalInputTokens = sum(input_tokens), - TotalOutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - AvgTokensPerRequest = round(avg(total_tokens), 0) - by StepGroup -| order by TotalTokens desc - -// 9. Token usage by model deployment -customEvents -| where name == 'LLM_Model_Token_Usage' -| where timestamp > ago(7d) -| extend model = tostring(customDimensions['model_deployment_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - Invocations = count() - by Model = model -| order by TotalTokens desc - -// 10. Token usage by model over time (hourly) -customEvents -| where name == 'LLM_Model_Token_Usage' -| where timestamp > ago(7d) -| extend model = tostring(customDimensions['model_deployment_name']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize TotalTokens = sum(total_tokens) by bin(timestamp, 1h), model -| order by timestamp asc -| render areachart - -// 11. 
Model token distribution (pie chart) -customEvents -| where name == 'LLM_Model_Token_Usage' -| where timestamp > ago(7d) -| extend model = tostring(customDimensions['model_deployment_name']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize TotalTokens = sum(total_tokens) by model -| render piechart - -// 12. Estimated cost by model (adjust pricing per model) +// 6. Estimated cost by model (adjust pricing per model) let gpt4o_input = 2.50; let gpt4o_output = 10.00; let gpt4o_mini_input = 0.15; @@ -191,11 +117,10 @@ customEvents | project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost | order by TotalCost desc -// 13. Step-to-model mapping with token usage +// 7. Token usage by model deployment customEvents -| where name == 'LLM_Agent_Token_Usage' +| where name == 'LLM_Model_Token_Usage' | where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) | extend model = tostring(customDimensions['model_deployment_name']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) @@ -205,53 +130,59 @@ customEvents OutputTokens = sum(output_tokens), TotalTokens = sum(total_tokens), Invocations = count() - by Step = agent, Model = model + by Model = model | order by TotalTokens desc -// 14. RAI agent specific token usage +// 8. 
Step-to-model mapping with token usage customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) | extend agent = tostring(customDimensions['agent_name']) -| where agent == "RAI" +| extend model = tostring(customDimensions['model_deployment_name']) +| extend process_id = tostring(customDimensions['process_id']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) | extend total_tokens = toint(customDimensions['total_tokens']) -| extend model = tostring(customDimensions['model_deployment_name']) +| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, model, process_id | summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens), TotalTokens = sum(total_tokens), - Invocations = count() - by Model = model + Invocations = dcount(process_id) + by Step = agent, Model = model +| order by TotalTokens desc -// 15. OpenTelemetry auto-instrumented OpenAI calls (if available) -dependencies -| where name has "openai" or target has "openai" +// 9. 
Top 20 token consumers by document +customEvents +| where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) -| extend input_tokens = tolong(customDimensions["gen_ai.usage.input_tokens"]) -| extend output_tokens = tolong(customDimensions["gen_ai.usage.output_tokens"]) -| extend model = tostring(customDimensions["gen_ai.request.model"]) -| where isnotnull(input_tokens) -| summarize - Calls = count(), - TotalInput = sum(input_tokens), - TotalOutput = sum(output_tokens) - by model -| order by TotalInput desc - -// ============================================================ -// Content Processing Specific Queries -// ============================================================ +| extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize total_tokens=max(total_tokens) by agent, process_id +| summarize TotalTokens = sum(total_tokens) by process_id +| join kind=leftouter ( + customEvents + | where name == 'LLM_Token_Usage_Summary' + | where timestamp > ago(7d) + | extend process_id = tostring(customDimensions['process_id']) + | extend file_name = tostring(customDimensions['file_name']) + | summarize file_name=take_any(file_name) by process_id +) on process_id +| project process_id, file_name, TotalTokens +| order by TotalTokens desc +| take 20 -// 16. Token usage by file type (PDF, DOCX, image, etc.) +// 10. Token usage by file type (PDF, DOCX, image, etc.) 
customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) | extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) | extend input_tokens = toint(customDimensions['input_tokens']) | extend output_tokens = toint(customDimensions['output_tokens']) | extend total_tokens = toint(customDimensions['total_tokens']) +| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id | summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id | join kind=leftouter ( customEvents @@ -277,23 +208,11 @@ customEvents by FileType = file_type | order by TotalTokens desc -// 17. Per-document token breakdown by step -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend process_id = tostring(customDimensions['process_id']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens) - by process_id, Step = agent -| order by process_id, TotalTokens desc +// ============================================================ +// Processing Time Queries +// ============================================================ -// 18. Step completion time (seconds from document start to step completion) +// 11. Step completion time (seconds from document start to step completion) customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) @@ -316,7 +235,7 @@ customEvents by Step = agent | order by AvgCompletionTime desc -// 19. OpenAI API call durations from dependencies table +// 12. 
OpenAI API call durations from dependencies table dependencies | where timestamp > ago(7d) | where target has "openai" or name has "chat" or type == "HTTP" or name has "openai" @@ -332,7 +251,7 @@ dependencies | order by TotalCalls desc | take 10 -// 20. Per-document step timeline +// 13. Per-document step timeline customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) @@ -349,7 +268,7 @@ customEvents | project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt | order by process_id, timestamp asc -// 21. Total document processing time (first to last step) +// 14. Total document processing time (first to last step) customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(7d) @@ -363,12 +282,34 @@ customEvents P90Seconds = round(percentile(TotalSeconds, 90), 2), MaxSeconds = round(max(TotalSeconds), 2) -// 21. Daily processing volume with token costs +// ============================================================ +// Percentiles & Trends +// ============================================================ + +// 15. Token usage percentiles per document +customEvents +| where name == 'LLM_Agent_Token_Usage' +| where timestamp > ago(7d) +| extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) +| extend total_tokens = toint(customDimensions['total_tokens']) +| summarize total_tokens=max(total_tokens) by agent, process_id +| summarize total_tokens=sum(total_tokens) by process_id +| summarize + p50 = percentile(total_tokens, 50), + p90 = percentile(total_tokens, 90), + p95 = percentile(total_tokens, 95), + p99 = percentile(total_tokens, 99), + Max = max(total_tokens) + +// 16. 
Daily processing volume with token usage customEvents | where name == 'LLM_Agent_Token_Usage' | where timestamp > ago(30d) | extend process_id = tostring(customDimensions['process_id']) +| extend agent = tostring(customDimensions['agent_name']) | extend total_tokens = toint(customDimensions['total_tokens']) +| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by agent, process_id | summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id | summarize DocumentsProcessed = count(), diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index 122e18b1..ad5e0688 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -59,7 +59,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", "size": 3, "title": "Overall Token Usage Summary", "timeContextFromParameter": "TimeRange", @@ -81,7 +81,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| project Step, InputTokens, OutputTokens, TotalTokens, Invocations\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| project Step, InputTokens, OutputTokens, TotalTokens, Invocations\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Pipeline Step", "timeContextFromParameter": "TimeRange", @@ -95,7 +95,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", "timeContextFromParameter": "TimeRange", @@ -109,7 +109,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", "size": 0, "title": "Token Distribution by Agent", "timeContextFromParameter": "TimeRange", @@ -130,7 +130,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", + "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", "timeContextFromParameter": "TimeRange", @@ -144,7 +144,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 
4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", + "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", "size": 0, "title": "Estimated Cost by Model", "timeContextFromParameter": "TimeRange", @@ -165,7 +165,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Model = model\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where 
name == 'LLM_Model_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Model", "timeContextFromParameter": "TimeRange", @@ -179,7 +179,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, model, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = dcount(process_id)\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Step-to-Model Token Mapping", "timeContextFromParameter": "TimeRange", @@ -193,7 +193,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize TotalTokens = sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend file_name = tostring(customDimensions['file_name'])\r\n | summarize file_name=take_any(file_name) by process_id\r\n) on process_id\r\n| project process_id, file_name, TotalTokens\r\n| order by TotalTokens desc\r\n| take 20", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend file_name = tostring(customDimensions['file_name'])\r\n | summarize file_name=take_any(file_name) by process_id\r\n) on process_id\r\n| project process_id, file_name, TotalTokens\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", "timeContextFromParameter": "TimeRange", @@ -207,7 +207,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend mime_type = tostring(customDimensions['file_mime_type'])\r\n | summarize mime_type=take_any(mime_type) by process_id\r\n) on process_id\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has 
\"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend mime_type = tostring(customDimensions['file_mime_type'])\r\n | summarize mime_type=take_any(mime_type) by process_id\r\n) on process_id\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", "timeContextFromParameter": "TimeRange", @@ -228,7 +228,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| 
where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc", "size": 0, "title": "Step Completion Time (seconds from doc start)", "timeContextFromParameter": "TimeRange", @@ -242,7 +242,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "dependencies\r\n| where timestamp > ago(7d)\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls = count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10", + "query": "dependencies\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls = count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10", "size": 0, "title": "OpenAI API Call Durations", "timeContextFromParameter": "TimeRange", @@ -256,7 +256,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc", "size": 0, "title": "Per-Document Step Timeline", "timeContextFromParameter": "TimeRange", @@ -270,7 +270,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id\r\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(TotalSeconds), 2),\r\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\r\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\r\n MaxSeconds = round(max(TotalSeconds), 2)", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id\r\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(TotalSeconds), 2),\r\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\r\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\r\n MaxSeconds = round(max(TotalSeconds), 2)", "size": 3, "title": "Total Document Processing Time (First to Last Step)", "timeContextFromParameter": "TimeRange", @@ -291,7 +291,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", "size": 3, "title": "Token Usage Percentiles Per Document", "timeContextFromParameter": "TimeRange", @@ -313,7 +313,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", "size": 0, "title": "Daily Processing Volume with Token Usage", "timeContextFromParameter": "TimeRange", @@ -325,11 +325,7 @@ } ], "isLocked": true, - "defaultResourceIds": [ - "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2" - ], - "fallbackResourceIds": [ - "/subscriptions/1d5876cd-7603-407a-96d2-ae5ca9a9c5f3/resourcegroups/rg-content-processing-token1/providers/microsoft.insights/components/appi-cptoken1kbxf2" - ], + "defaultResourceIds": [], + "fallbackResourceIds": [], "fromTemplateId": "community-Workbooks/Common/Templates" } \ No newline at end of file diff --git a/infra/main.bicep b/infra/main.bicep index 9f4ec91e..1e732bbe 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -881,7 +881,12 @@ module avmContainerApp 'br/public:avm/res/app/container-app:0.22.1' = { environmentResourceId: avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' enableTelemetry: enableTelemetry - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] managedIdentities: { systemAssigned: true userAssignedResourceIds: [ @@ -950,7 +955,12 @@ module avmContainerApp_API 'br/public:avm/res/app/container-app:0.22.1' = { environmentResourceId: 
avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' enableTelemetry: enableTelemetry - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] tags: tags managedIdentities: { systemAssigned: true @@ -1082,7 +1092,12 @@ module avmContainerApp_Web 'br/public:avm/res/app/container-app:0.22.1' = { environmentResourceId: avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' enableTelemetry: enableTelemetry - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] tags: tags managedIdentities: { systemAssigned: true @@ -1165,7 +1180,12 @@ module avmContainerApp_Workflow 'br/public:avm/res/app/container-app:0.22.1' = { environmentResourceId: avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' enableTelemetry: enableTelemetry - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] tags: tags managedIdentities: { systemAssigned: true @@ -1543,7 +1563,12 @@ module avmContainerApp_update 'br/public:avm/res/app/container-app:0.22.1' = { enableTelemetry: enableTelemetry environmentResourceId: avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] tags: tags managedIdentities: { systemAssigned: true @@ -1625,7 +1650,12 @@ module avmContainerApp_API_update 'br/public:avm/res/app/container-app:0.22.1' = enableTelemetry: enableTelemetry environmentResourceId: avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] tags: tags managedIdentities: { systemAssigned: 
true @@ -1761,7 +1791,12 @@ module avmContainerApp_Workflow_update 'br/public:avm/res/app/container-app:0.22 enableTelemetry: enableTelemetry environmentResourceId: avmContainerAppEnv.outputs.resourceId workloadProfileName: 'Consumption' - registries: null + registries: [ + { + server: containerRegistryEndpoint + identity: avmContainerRegistryReader.outputs.resourceId + } + ] tags: tags managedIdentities: { systemAssigned: true diff --git a/infra/main.json b/infra/main.json index e8ba8e73..b1e3d0c2 100644 --- a/infra/main.json +++ b/infra/main.json @@ -6,7 +6,7 @@ "_generator": { "name": "bicep", "version": "0.42.1.51946", - "templateHash": "5885652317352749587" + "templateHash": "14320065740070986438" }, "name": "Content Processing Solution Accelerator", "description": "Bicep template to deploy the Content Processing Solution Accelerator with AVM compliance." @@ -243,8 +243,6 @@ "bastionHostName": "[format('bas-{0}', variables('solutionSuffix'))]", "jumpboxVmName": "[take(format('vm-{0}', variables('solutionSuffix')), 15)]", "dataCollectionRulesResourceName": "[format('dcr-{0}', variables('solutionSuffix'))]", - "logAnalyticsWorkspaceResourceName": "[format('log-{0}', variables('solutionSuffix'))]", - "dcrLogAnalyticsDestinationName": "[format('la-{0}-destination', variables('logAnalyticsWorkspaceResourceName'))]", "privateDnsZones": [ "privatelink.cognitiveservices.azure.com", "privatelink.openai.azure.com", @@ -14609,10 +14607,19 @@ { "name": "SecurityAuditEvents", "streams": [ - "Microsoft-Event" + "Microsoft-WindowsEvent" + ], + "eventLogName": "Security", + "eventTypes": [ + { + "eventType": "Audit Success" + }, + { + "eventType": "Audit Failure" + } ], "xPathQueries": [ - "Security!*[System[(band(Keywords,13510798882111488)) and (EventID != 4624)]]" + "Security!*[System[(EventID=4624 or EventID=4625)]]" ] } ] @@ -14621,7 +14628,7 @@ "logAnalytics": [ { "workspaceResourceId": "[reference('logAnalyticsWorkspace').outputs.resourceId.value]", - "name": 
"[variables('dcrLogAnalyticsDestinationName')]" + "name": "[format('la-{0}', variables('dataCollectionRulesResourceName'))]" } ] }, @@ -14631,20 +14638,10 @@ "Microsoft-Perf" ], "destinations": [ - "[variables('dcrLogAnalyticsDestinationName')]" + "[format('la-{0}', variables('dataCollectionRulesResourceName'))]" ], "transformKql": "source", "outputStream": "Microsoft-Perf" - }, - { - "streams": [ - "Microsoft-Event" - ], - "destinations": [ - "[variables('dcrLogAnalyticsDestinationName')]" - ], - "transformKql": "source", - "outputStream": "Microsoft-Event" } ] } @@ -19275,7 +19272,7 @@ "mode": "Incremental", "parameters": { "name": { - "value": "[variables('logAnalyticsWorkspaceResourceName')]" + "value": "[format('log-{0}', variables('solutionSuffix'))]" }, "location": { "value": "[parameters('location')]" @@ -28053,9 +28050,6 @@ "ipRules": [] } }, - "requireInfrastructureEncryption": { - "value": true - }, "supportsHttpsTrafficOnly": { "value": true }, @@ -36183,8 +36177,8 @@ "avmContainerApp_API", "avmContainerApp_Workflow", "avmManagedIdentity", - "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').storageQueue)]", "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').storageBlob)]", + "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').storageQueue)]", "virtualNetwork" ] }, @@ -42488,9 +42482,9 @@ "dependsOn": [ "avmAiServices", "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').contentUnderstanding)]", - "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').openAI)]", - "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').aiServices)]", "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').cognitiveServices)]", + "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').aiServices)]", + "[format('avmPrivateDnsZones[{0}]', variables('dnsZoneIndex').openAI)]", "virtualNetwork" ] }, @@ -44170,7 +44164,12 @@ "value": "[parameters('enableTelemetry')]" }, "registries": { - "value": null + 
"value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "managedIdentities": { "value": { @@ -45801,7 +45800,12 @@ "value": "[parameters('enableTelemetry')]" }, "registries": { - "value": null + "value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "tags": { "value": "[parameters('tags')]" @@ -45916,9 +45920,6 @@ "ingressTransport": { "value": "auto" }, - "ingressAllowInsecure": { - "value": false - }, "corsPolicy": { "value": { "allowedOrigins": [ @@ -47499,7 +47500,12 @@ "value": "[parameters('enableTelemetry')]" }, "registries": { - "value": null + "value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "tags": { "value": "[parameters('tags')]" @@ -47524,9 +47530,6 @@ "ingressTransport": { "value": "auto" }, - "ingressAllowInsecure": { - "value": false - }, "scaleSettings": { "value": { "maxReplicas": "[if(parameters('enableScalability'), 3, 2)]", @@ -49150,7 +49153,12 @@ "value": "[parameters('enableTelemetry')]" }, "registries": { - "value": null + "value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "tags": { "value": "[parameters('tags')]" @@ -50799,6 +50807,9 @@ "EnableMongo" ] }, + "enableAnalyticalStorage": { + "value": true + }, "defaultConsistencyLevel": { "value": "Session" }, @@ -61435,7 +61446,12 @@ "value": "Consumption" }, "registries": { - "value": null + "value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "tags": { "value": "[parameters('tags')]" @@ -63069,7 +63085,12 @@ "value": "Consumption" }, 
"registries": { - "value": null + "value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "tags": { "value": "[parameters('tags')]" @@ -63184,9 +63205,6 @@ "ingressTransport": { "value": "auto" }, - "ingressAllowInsecure": { - "value": false - }, "corsPolicy": { "value": { "allowedOrigins": [ @@ -64769,7 +64787,12 @@ "value": "Consumption" }, "registries": { - "value": null + "value": [ + { + "server": "[parameters('containerRegistryEndpoint')]", + "identity": "[reference('avmContainerRegistryReader').outputs.resourceId.value]" + } + ] }, "tags": { "value": "[parameters('tags')]" From d482a3ce35e1289b8942fd0679312207e603b184 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Mon, 18 May 2026 15:16:25 +0530 Subject: [PATCH 15/23] Enable monitoring (Application Insights) for non-WAF deployment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/main.parameters.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/infra/main.parameters.json b/infra/main.parameters.json index 44153d57..de82c938 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -34,6 +34,9 @@ }, "imageTag": { "value": "${AZURE_ENV_IMAGETAG=latest_v2}" + }, + "enableMonitoring": { + "value": true } } } \ No newline at end of file From 85e9f49a2b055ba13a8991a8021b2174ec9b1f39 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Mon, 18 May 2026 17:53:43 +0530 Subject: [PATCH 16/23] Fix workbook cross-resource queries for separate RG deployment Add crossComponentResources and defaultResourceIds to workbook JSON so it can query App Insights from a different resource group. Update Bicep to replace placeholder with actual resource ID at deploy time. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/deploy-workbook.bicep | 4 +- infra/dashboards/token-usage-workbook.json | 88 +++++++++++++++++----- 2 files changed, 73 insertions(+), 19 deletions(-) diff --git a/infra/dashboards/deploy-workbook.bicep b/infra/dashboards/deploy-workbook.bicep index 9d030a56..d44e10d1 100644 --- a/infra/dashboards/deploy-workbook.bicep +++ b/infra/dashboards/deploy-workbook.bicep @@ -10,6 +10,8 @@ param appInsightsResourceId string param location string = resourceGroup().location var workbookId = guid(resourceGroup().id, 'token-usage-workbook') +var workbookTemplate = loadTextContent('token-usage-workbook.json') +var workbookContent = replace(workbookTemplate, '__APP_INSIGHTS_RESOURCE_ID__', appInsightsResourceId) resource workbook 'Microsoft.Insights/workbooks@2022-04-01' = { name: workbookId @@ -19,7 +21,7 @@ resource workbook 'Microsoft.Insights/workbooks@2022-04-01' = { displayName: 'LLM Token Usage Dashboard' category: 'workbook' sourceId: appInsightsResourceId - serializedData: loadTextContent('token-usage-workbook.json') + serializedData: workbookContent } } diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index ad5e0688..dc3dd26c 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -73,7 +73,10 @@ "formatter": 1 } ] - } + }, + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "overall-summary" }, @@ -87,7 +90,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "tokens-by-agent" }, @@ -101,7 +107,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "areachart" + "visualization": 
"areachart", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "tokens-over-time" }, @@ -115,7 +124,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "piechart" + "visualization": "piechart", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "token-distribution-pie" }, @@ -136,7 +148,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "cost-estimation" }, @@ -150,7 +165,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "cost-by-model" }, @@ -171,7 +189,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "tokens-by-model" }, @@ -185,7 +206,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "step-model-mapping" }, @@ -199,7 +223,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "top-consumers" }, @@ -213,7 +240,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", 
+ "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "tokens-by-filetype" }, @@ -234,7 +264,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "processing-time-by-step" }, @@ -248,7 +281,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "openai-call-durations" }, @@ -262,7 +298,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "per-document-time-breakdown" }, @@ -276,7 +315,10 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "total-processing-time" }, @@ -305,7 +347,10 @@ "formatter": 1 } ] - } + }, + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "token-percentiles" }, @@ -319,13 +364,20 @@ "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", - "visualization": "table" + "visualization": "table", + "crossComponentResources": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ] }, "name": "daily-volume" } ], "isLocked": true, - "defaultResourceIds": [], - "fallbackResourceIds": [], + "defaultResourceIds": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ], + "fallbackResourceIds": [ + "__APP_INSIGHTS_RESOURCE_ID__" + ], "fromTemplateId": "community-Workbooks/Common/Templates" } \ No newline at end of file From 
f84a617a62da758b7327a2e15b658bcc413c3965 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Wed, 20 May 2026 13:32:24 +0530 Subject: [PATCH 17/23] fix: workbook queries - dedup tokens, MapHandler-only doc count, inner join for file types - Token Usage by Model: use max() dedup per process_id to handle duplicate events - Overall Summary: count documents only from MapHandler process_ids - Token Usage by File Type: use inner join to exclude process_ids without mime type - Daily Processing Volume: count only MapHandler process_ids - Fix queryType to 0 (Azure Monitor Logs) and add TimeRange parameter Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/token-usage-workbook.json | 116 +++++++++++---------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json index dc3dd26c..65259e70 100644 --- a/infra/dashboards/token-usage-workbook.json +++ b/infra/dashboards/token-usage-workbook.json @@ -8,6 +8,13 @@ }, "name": "title" }, + { + "type": 1, + "content": { + "json": "---\n## Overview" + }, + "name": "section-overview" + }, { "type": 9, "content": { @@ -28,9 +35,15 @@ { "durationMs": 3600000 }, + { + "durationMs": 14400000 + }, { "durationMs": 86400000 }, + { + "durationMs": 259200000 + }, { "durationMs": 604800000 }, @@ -48,21 +61,13 @@ }, "name": "parameters" }, - { - "type": 1, - "content": { - "json": "---\n## Overview" - }, - "name": "section-overview" - }, { "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n TotalDocuments = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDocument = round(avg(total_tokens), 0)", + "query": "let docs = customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| distinct process_id;\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\n| summarize\n TotalDocuments = dcountif(process_id, process_id in (docs)),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDocument = round(avg(total_tokens), 
0)", "size": 3, "title": "Overall Token Usage Summary", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", @@ -76,7 +81,8 @@ }, "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "overall-summary" }, @@ -84,16 +90,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Step = agent\r\n| project Step, InputTokens, OutputTokens, TotalTokens, Invocations\r\n| order by TotalTokens desc", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n 
Documents = dcount(process_id)\n by Step = agent\n| project Step, InputTokens, OutputTokens, TotalTokens, Documents\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Pipeline Step", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "tokens-by-agent" }, @@ -101,16 +107,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = 
sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", "size": 0, "title": "Token Usage Over Time (Hourly)", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "areachart", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "tokens-over-time" }, @@ -118,16 +124,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", "size": 0, "title": "Token Distribution by Agent", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "piechart", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "token-distribution-pie" }, @@ -142,16 +148,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp 
between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", + "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = 
InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", "size": 0, "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "cost-estimation" }, @@ -159,16 +165,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", + "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| 
extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", "size": 0, "title": "Estimated Cost by Model", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "cost-by-model" }, @@ -183,16 +189,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = count()\r\n by Model = model\r\n| order by TotalTokens desc", + "query": "customEvents\n| where name == 'LLM_Model_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by process_id, model\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = dcount(process_id)\n by Model = model\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by Model", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "tokens-by-model" }, @@ -200,16 +206,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, model, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = dcount(process_id)\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, model, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = dcount(process_id)\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", "size": 0, "title": "Step-to-Model Token Mapping", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": 
"step-model-mapping" }, @@ -217,16 +223,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend file_name = tostring(customDimensions['file_name'])\r\n | summarize file_name=take_any(file_name) by process_id\r\n) on process_id\r\n| project process_id, file_name, TotalTokens\r\n| order by TotalTokens desc\r\n| take 20", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend file_name = tostring(customDimensions['file_name'])\r\n | summarize file_name=take_any(file_name) by process_id\r\n) on process_id\r\n| project process_id, file_name, TotalTokens\r\n| order by TotalTokens desc\r\n| take 20", "size": 0, "title": "Top 20 Token Consumers by Document", - 
"timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "top-consumers" }, @@ -234,16 +240,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend mime_type = tostring(customDimensions['file_mime_type'])\r\n | summarize mime_type=take_any(mime_type) by process_id\r\n) on process_id\r\n| extend file_type = case(\r\n mime_type has \"pdf\", \"PDF\",\r\n mime_type has \"image\", \"Image\",\r\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\r\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\r\n mime_type has \"text\", \"Text\",\r\n \"Other\")\r\n| summarize\r\n Documents = count(),\r\n TotalInputTokens = sum(input_tokens),\r\n TotalOutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0)\r\n by FileType = file_type\r\n| order by TotalTokens desc", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\n| join kind=inner (\n customEvents\n | where name == 'LLM_Token_Usage_Summary'\n | where timestamp > ago(7d)\n | extend process_id = tostring(customDimensions['process_id'])\n | extend mime_type = tostring(customDimensions['file_mime_type'])\n | summarize mime_type=take_any(mime_type) by process_id\n) on process_id\n| extend file_type = case(\n mime_type has \"pdf\", \"PDF\",\n mime_type has \"image\", \"Image\",\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\n mime_type has \"excel\" or mime_type 
has \"xlsx\", \"Excel\",\n mime_type has \"text\", \"Text\",\n \"Other\")\n| summarize\n Documents = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0)\n by FileType = file_type\n| order by TotalTokens desc", "size": 0, "title": "Token Usage by File Type", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "tokens-by-filetype" }, @@ -258,16 +264,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc", "size": 0, "title": "Step Completion Time (seconds from doc start)", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "processing-time-by-step" }, @@ -275,16 +281,16 @@ "type": 3, 
"content": { "version": "KqlItem/1.0", - "query": "dependencies\r\n| where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls = count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10", + "query": "dependencies\r\n| where timestamp > ago(7d)\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls = count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10", "size": 0, "title": "OpenAI API Call Durations", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "openai-call-durations" }, @@ -292,16 +298,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp between (todatetime('{TimeRange:start}') .. todatetime('{TimeRange:end}'))\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc", + "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc", "size": 0, "title": "Per-Document Step Timeline", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "per-document-time-breakdown" }, @@ -309,16 +315,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id\r\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(TotalSeconds), 2),\r\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\r\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\r\n MaxSeconds = round(max(TotalSeconds), 2)", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\n| summarize\n DocumentsProcessed = dcount(process_id),\n AvgSeconds = round(avg(TotalSeconds), 2),\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\n MaxSeconds = round(max(TotalSeconds), 2)", "size": 3, "title": "Total Document Processing Time (First to Last Step)", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "total-processing-time" }, @@ -333,10 +339,9 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize total_tokens=sum(total_tokens) by process_id\r\n| summarize\r\n p50 = percentile(total_tokens, 50),\r\n p90 = percentile(total_tokens, 90),\r\n p95 = percentile(total_tokens, 95),\r\n p99 = percentile(total_tokens, 99),\r\n Max = max(total_tokens)", + "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize total_tokens=max(total_tokens) by process_id\n| summarize\n p50 = round(percentile(total_tokens, 50), 0),\n p90 = round(percentile(total_tokens, 90), 0),\n p95 = round(percentile(total_tokens, 95), 0),\n p99 = round(percentile(total_tokens, 99), 0),\n Max = max(total_tokens)", "size": 3, "title": "Token Usage Percentiles Per Document", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", @@ -350,7 +355,8 @@ }, "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "token-percentiles" }, @@ -358,16 +364,16 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp between (todatetime('{TimeRange:start}') .. 
todatetime('{TimeRange:end}'))\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id\r\n| summarize\r\n DocumentsProcessed = count(),\r\n TotalTokens = sum(total_tokens),\r\n AvgTokensPerDoc = round(avg(total_tokens), 0),\r\n MaxTokensPerDoc = max(total_tokens)\r\n by Day = bin(timestamp, 1d)\r\n| order by Day desc", + "query": "let docs = customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| distinct process_id;\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize total_tokens=max(total_tokens) by agent, process_id, bin(timestamp, 1d)\n| summarize total_tokens=sum(total_tokens) by process_id, Day=bin(timestamp, 1d)\n| summarize\n DocumentsProcessed = dcountif(process_id, process_id in (docs)),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0),\n MaxTokensPerDoc = max(total_tokens)\n by Day", "size": 0, "title": "Daily Processing Volume with Token Usage", - "timeContextFromParameter": "TimeRange", "queryType": 0, "resourceType": "microsoft.insights/components", "visualization": "table", "crossComponentResources": [ "__APP_INSIGHTS_RESOURCE_ID__" - ] + ], + "timeContextFromParameter": "TimeRange" }, "name": "daily-volume" } From 6e04499f14e11ae11555598e8dc2ad72ce338e6c Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Mon, 1 Jun 2026 
15:19:14 +0530 Subject: [PATCH 18/23] refactor: replace workbook-specific token tracking with generic telemetry logging - Remove Azure Workbook infrastructure (bicep, KQL, JSON) - Replace token_usage_utils with llm_token_telemetry and telemetry modules - Update pipeline handlers and workflow executors to use new telemetry - Update unit tests for new module structure Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/dashboards/deploy-workbook.bicep | 29 - infra/dashboards/token-usage-queries.kql | 320 ------ infra/dashboards/token-usage-workbook.json | 389 ------- .../src/libs/llm_token_telemetry.py | 990 ++++++++++++++++++ .../src/libs/pipeline/handlers/map_handler.py | 11 +- .../libs/pipeline/handlers/save_handler.py | 25 - src/ContentProcessor/src/libs/telemetry.py | 92 ++ .../src/libs/token_usage_utils.py | 245 ----- .../tests/unit/libs/test_token_usage_utils.py | 368 ++++--- .../src/libs/llm_token_telemetry.py | 990 ++++++++++++++++++ .../src/libs/telemetry.py | 92 ++ .../src/libs/token_usage_utils.py | 245 ----- .../gap_analysis/executor/gap_executor.py | 11 +- .../src/steps/rai/executor/rai_executor.py | 11 +- .../summarize/executor/summarize_executor.py | 11 +- 15 files changed, 2435 insertions(+), 1394 deletions(-) delete mode 100644 infra/dashboards/deploy-workbook.bicep delete mode 100644 infra/dashboards/token-usage-queries.kql delete mode 100644 infra/dashboards/token-usage-workbook.json create mode 100644 src/ContentProcessor/src/libs/llm_token_telemetry.py create mode 100644 src/ContentProcessor/src/libs/telemetry.py delete mode 100644 src/ContentProcessor/src/libs/token_usage_utils.py create mode 100644 src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py create mode 100644 src/ContentProcessorWorkflow/src/libs/telemetry.py delete mode 100644 src/ContentProcessorWorkflow/src/libs/token_usage_utils.py diff --git a/infra/dashboards/deploy-workbook.bicep b/infra/dashboards/deploy-workbook.bicep deleted file mode 
100644 index d44e10d1..00000000 --- a/infra/dashboards/deploy-workbook.bicep +++ /dev/null @@ -1,29 +0,0 @@ -// Standalone deployment for LLM Token Usage Workbook -// Connects to an existing Application Insights instance from any content processing RG - -targetScope = 'resourceGroup' - -@description('Full resource ID of the Application Insights instance to query.') -param appInsightsResourceId string - -@description('Azure region for the workbook resource.') -param location string = resourceGroup().location - -var workbookId = guid(resourceGroup().id, 'token-usage-workbook') -var workbookTemplate = loadTextContent('token-usage-workbook.json') -var workbookContent = replace(workbookTemplate, '__APP_INSIGHTS_RESOURCE_ID__', appInsightsResourceId) - -resource workbook 'Microsoft.Insights/workbooks@2022-04-01' = { - name: workbookId - location: location - kind: 'shared' - properties: { - displayName: 'LLM Token Usage Dashboard' - category: 'workbook' - sourceId: appInsightsResourceId - serializedData: workbookContent - } -} - -output workbookName string = workbook.name -output workbookId string = workbook.id diff --git a/infra/dashboards/token-usage-queries.kql b/infra/dashboards/token-usage-queries.kql deleted file mode 100644 index c5070943..00000000 --- a/infra/dashboards/token-usage-queries.kql +++ /dev/null @@ -1,320 +0,0 @@ -// ============================================================ -// KQL Queries for LLM Token Usage Monitoring -// Content Processing Solution Accelerator -// Run these in Application Insights > Logs -// -// IMPORTANT: All queries use a deduplication pattern: -// max() by (agent, process_id) first, then sum() -// This handles duplicate custom events that can occur when -// both ContentProcessor and ContentProcessorWorkflow emit -// telemetry through the same Application Insights instance. -// ============================================================ - -// 1. 
Overall token usage summary -// Dedup: max by (agent, process_id) → sum by process_id → aggregate -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id -| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id -| summarize - TotalDocuments = count(), - TotalInputTokens = sum(input_tokens), - TotalOutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - AvgTokensPerDocument = round(avg(total_tokens), 0) - -// 2. Token usage by pipeline step (agent) -// Dedup: max by (agent, process_id) → sum by agent -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend process_id = tostring(customDimensions['process_id']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id -| summarize - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - Invocations = count() - by Step = agent -| project Step, InputTokens, OutputTokens, TotalTokens, Invocations -| order by TotalTokens desc - -// 3. 
Token usage over time (hourly) -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id -| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id -| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h) -| order by timestamp asc -| render areachart - -// 4. Token distribution by agent (pie chart) -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend process_id = tostring(customDimensions['process_id']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize total_tokens=max(total_tokens) by agent, process_id -| summarize TotalTokens = sum(total_tokens) by agent -| render piechart - -// 5. 
Estimated daily cost (GPT-4o pricing: $2.50/1M input, $10.00/1M output) -let input_price_per_million = 2.50; -let output_price_per_million = 10.00; -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(30d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id -| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id -| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d) -| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4) -| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4) -| extend TotalCost = InputCost + OutputCost -| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost -| order by Day desc - -// 6. 
Estimated cost by model (adjust pricing per model) -let gpt4o_input = 2.50; -let gpt4o_output = 10.00; -let gpt4o_mini_input = 0.15; -let gpt4o_mini_output = 0.60; -customEvents -| where name == 'LLM_Model_Token_Usage' -| where timestamp > ago(30d) -| extend model = tostring(customDimensions['model_deployment_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model -| extend InputPrice = case( - model has "mini", gpt4o_mini_input, - gpt4o_input) -| extend OutputPrice = case( - model has "mini", gpt4o_mini_output, - gpt4o_output) -| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4) -| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4) -| extend TotalCost = InputCost + OutputCost -| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost -| order by TotalCost desc - -// 7. Token usage by model deployment -customEvents -| where name == 'LLM_Model_Token_Usage' -| where timestamp > ago(7d) -| extend model = tostring(customDimensions['model_deployment_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - Invocations = count() - by Model = model -| order by TotalTokens desc - -// 8. 
Step-to-model mapping with token usage -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend model = tostring(customDimensions['model_deployment_name']) -| extend process_id = tostring(customDimensions['process_id']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, model, process_id -| summarize - InputTokens = sum(input_tokens), - OutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - Invocations = dcount(process_id) - by Step = agent, Model = model -| order by TotalTokens desc - -// 9. Top 20 token consumers by document -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize total_tokens=max(total_tokens) by agent, process_id -| summarize TotalTokens = sum(total_tokens) by process_id -| join kind=leftouter ( - customEvents - | where name == 'LLM_Token_Usage_Summary' - | where timestamp > ago(7d) - | extend process_id = tostring(customDimensions['process_id']) - | extend file_name = tostring(customDimensions['file_name']) - | summarize file_name=take_any(file_name) by process_id -) on process_id -| project process_id, file_name, TotalTokens -| order by TotalTokens desc -| take 20 - -// 10. Token usage by file type (PDF, DOCX, image, etc.) 
-customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend input_tokens = toint(customDimensions['input_tokens']) -| extend output_tokens = toint(customDimensions['output_tokens']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id -| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id -| join kind=leftouter ( - customEvents - | where name == 'LLM_Token_Usage_Summary' - | where timestamp > ago(7d) - | extend process_id = tostring(customDimensions['process_id']) - | extend mime_type = tostring(customDimensions['file_mime_type']) - | summarize mime_type=take_any(mime_type) by process_id -) on process_id -| extend file_type = case( - mime_type has "pdf", "PDF", - mime_type has "image", "Image", - mime_type has "word" or mime_type has "docx", "Word", - mime_type has "excel" or mime_type has "xlsx", "Excel", - mime_type has "text", "Text", - "Other") -| summarize - Documents = count(), - TotalInputTokens = sum(input_tokens), - TotalOutputTokens = sum(output_tokens), - TotalTokens = sum(total_tokens), - AvgTokensPerDoc = round(avg(total_tokens), 0) - by FileType = file_type -| order by TotalTokens desc - -// ============================================================ -// Processing Time Queries -// ============================================================ - -// 11. 
Step completion time (seconds from document start to step completion) -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend process_id = tostring(customDimensions['process_id']) -| join kind=inner ( - customEvents - | where name == 'LLM_Agent_Token_Usage' - | where timestamp > ago(7d) - | extend process_id = tostring(customDimensions['process_id']) - | summarize DocStartTime = min(timestamp) by process_id -) on process_id -| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2) -| summarize - AvgCompletionTime = round(avg(StepDurationSeconds), 2), - P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2), - P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2), - MaxCompletionTime = round(max(StepDurationSeconds), 2), - Invocations = count() - by Step = agent -| order by AvgCompletionTime desc - -// 12. OpenAI API call durations from dependencies table -dependencies -| where timestamp > ago(7d) -| where target has "openai" or name has "chat" or type == "HTTP" or name has "openai" -| where success == true -| extend durationSeconds = round(duration / 1000.0, 2) -| summarize - TotalCalls = count(), - AvgSeconds = round(avg(durationSeconds), 2), - P50Seconds = round(percentile(durationSeconds, 50), 2), - P90Seconds = round(percentile(durationSeconds, 90), 2), - MaxSeconds = round(max(durationSeconds), 2) - by OperationName = name -| order by TotalCalls desc -| take 10 - -// 13. 
Per-document step timeline -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend agent = tostring(customDimensions['agent_name']) -| extend process_id = tostring(customDimensions['process_id']) -| join kind=inner ( - customEvents - | where name == 'LLM_Agent_Token_Usage' - | where timestamp > ago(7d) - | extend process_id = tostring(customDimensions['process_id']) - | summarize DocStartTime = min(timestamp) by process_id -) on process_id -| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2) -| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt -| order by process_id, timestamp asc - -// 14. Total document processing time (first to last step) -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id -| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2) -| summarize - DocumentsProcessed = count(), - AvgSeconds = round(avg(TotalSeconds), 2), - P50Seconds = round(percentile(TotalSeconds, 50), 2), - P90Seconds = round(percentile(TotalSeconds, 90), 2), - MaxSeconds = round(max(TotalSeconds), 2) - -// ============================================================ -// Percentiles & Trends -// ============================================================ - -// 15. 
Token usage percentiles per document -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(7d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize total_tokens=max(total_tokens) by agent, process_id -| summarize total_tokens=sum(total_tokens) by process_id -| summarize - p50 = percentile(total_tokens, 50), - p90 = percentile(total_tokens, 90), - p95 = percentile(total_tokens, 95), - p99 = percentile(total_tokens, 99), - Max = max(total_tokens) - -// 16. Daily processing volume with token usage -customEvents -| where name == 'LLM_Agent_Token_Usage' -| where timestamp > ago(30d) -| extend process_id = tostring(customDimensions['process_id']) -| extend agent = tostring(customDimensions['agent_name']) -| extend total_tokens = toint(customDimensions['total_tokens']) -| summarize total_tokens=max(total_tokens), timestamp=min(timestamp) by agent, process_id -| summarize total_tokens=sum(total_tokens), timestamp=min(timestamp) by process_id -| summarize - DocumentsProcessed = count(), - TotalTokens = sum(total_tokens), - AvgTokensPerDoc = round(avg(total_tokens), 0), - MaxTokensPerDoc = max(total_tokens) - by Day = bin(timestamp, 1d) -| order by Day desc diff --git a/infra/dashboards/token-usage-workbook.json b/infra/dashboards/token-usage-workbook.json deleted file mode 100644 index 65259e70..00000000 --- a/infra/dashboards/token-usage-workbook.json +++ /dev/null @@ -1,389 +0,0 @@ -{ - "version": "Notebook/1.0", - "items": [ - { - "type": 1, - "content": { - "json": "# LLM Token Usage Dashboard\n\nMonitors token consumption across all pipeline agents in Content Processing Solution Accelerator." 
- }, - "name": "title" - }, - { - "type": 1, - "content": { - "json": "---\n## Overview" - }, - "name": "section-overview" - }, - { - "type": 9, - "content": { - "version": "KqlParameterItem/1.0", - "parameters": [ - { - "id": "a0b1c2d3-e4f5-6789-abcd-ef0123456789", - "version": "KqlParameterItem/1.0", - "name": "TimeRange", - "label": "Time Range", - "type": 4, - "isRequired": true, - "value": { - "durationMs": 604800000 - }, - "typeSettings": { - "selectableValues": [ - { - "durationMs": 3600000 - }, - { - "durationMs": 14400000 - }, - { - "durationMs": 86400000 - }, - { - "durationMs": 259200000 - }, - { - "durationMs": 604800000 - }, - { - "durationMs": 2592000000 - } - ], - "allowCustom": true - } - } - ], - "style": "pills", - "queryType": 0, - "resourceType": "microsoft.insights/components" - }, - "name": "parameters" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let docs = customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| distinct process_id;\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\n| summarize\n TotalDocuments = dcountif(process_id, process_id in (docs)),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = 
sum(total_tokens),\n AvgTokensPerDocument = round(avg(total_tokens), 0)", - "size": 3, - "title": "Overall Token Usage Summary", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "gridSettings": { - "formatters": [ - { - "columnMatch": "TotalTokens", - "formatter": 1 - } - ] - }, - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "overall-summary" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Documents = dcount(process_id)\n by Step = agent\n| project Step, InputTokens, OutputTokens, TotalTokens, Documents\n| order by TotalTokens desc", - "size": 0, - "title": "Token Usage by Pipeline Step", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "tokens-by-agent" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = 
toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize InputTokens = sum(input_tokens), OutputTokens = sum(output_tokens) by bin(timestamp, 1h)\r\n| order by timestamp asc", - "size": 0, - "title": "Token Usage Over Time (Hourly)", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "areachart", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "tokens-over-time" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by agent", - "size": 0, - "title": "Token Distribution by Agent", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "piechart", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "token-distribution-pie" - }, - { - "type": 1, - "content": { - "json": "---\n## Cost Estimation" - }, - "name": "section-cost" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let input_price_per_million = 2.50;\r\nlet output_price_per_million = 10.00;\r\ncustomEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = 
tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), timestamp=min(timestamp) by agent, process_id\r\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), timestamp=min(timestamp) by process_id\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by bin(timestamp, 1d)\r\n| extend InputCost = round(TotalInput * input_price_per_million / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * output_price_per_million / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Day = timestamp, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by Day desc", - "size": 0, - "title": "Estimated Daily Cost (GPT-4o Pricing: $2.50/1M input, $10.00/1M output)", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "cost-estimation" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let gpt4o_input = 2.50;\r\nlet gpt4o_output = 10.00;\r\nlet gpt4o_mini_input = 0.15;\r\nlet gpt4o_mini_output = 0.60;\r\ncustomEvents\r\n| where name == 'LLM_Model_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| summarize TotalInput = sum(input_tokens), TotalOutput = sum(output_tokens) by model\r\n| extend InputPrice = case(model has \"mini\", gpt4o_mini_input, gpt4o_input)\r\n| extend OutputPrice = case(model has \"mini\", gpt4o_mini_output, 
gpt4o_output)\r\n| extend InputCost = round(TotalInput * InputPrice / 1000000.0, 4)\r\n| extend OutputCost = round(TotalOutput * OutputPrice / 1000000.0, 4)\r\n| extend TotalCost = InputCost + OutputCost\r\n| project Model = model, TotalInput, TotalOutput, InputCost, OutputCost, TotalCost\r\n| order by TotalCost desc", - "size": 0, - "title": "Estimated Cost by Model", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "cost-by-model" - }, - { - "type": 1, - "content": { - "json": "---\n## Model & Document Details" - }, - "name": "section-details" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Model_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend model = tostring(customDimensions['model_deployment_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by process_id, model\n| summarize\n InputTokens = sum(input_tokens),\n OutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n Invocations = dcount(process_id)\n by Model = model\n| order by TotalTokens desc", - "size": 0, - "title": "Token Usage by Model", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "tokens-by-model" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > 
ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend model = tostring(customDimensions['model_deployment_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend input_tokens = toint(customDimensions['input_tokens'])\r\n| extend output_tokens = toint(customDimensions['output_tokens'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, model, process_id\r\n| summarize\r\n InputTokens = sum(input_tokens),\r\n OutputTokens = sum(output_tokens),\r\n TotalTokens = sum(total_tokens),\r\n Invocations = dcount(process_id)\r\n by Step = agent, Model = model\r\n| order by TotalTokens desc", - "size": 0, - "title": "Step-to-Model Token Mapping", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "step-model-mapping" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend total_tokens = toint(customDimensions['total_tokens'])\r\n| summarize total_tokens=max(total_tokens) by agent, process_id\r\n| summarize TotalTokens = sum(total_tokens) by process_id\r\n| join kind=leftouter (\r\n customEvents\r\n | where name == 'LLM_Token_Usage_Summary'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | extend file_name = tostring(customDimensions['file_name'])\r\n | summarize file_name=take_any(file_name) by process_id\r\n) on process_id\r\n| project process_id, file_name, TotalTokens\r\n| order by TotalTokens desc\r\n| take 20", - 
"size": 0, - "title": "Top 20 Token Consumers by Document", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "top-consumers" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend input_tokens = toint(customDimensions['input_tokens'])\n| extend output_tokens = toint(customDimensions['output_tokens'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize input_tokens=max(input_tokens), output_tokens=max(output_tokens), total_tokens=max(total_tokens) by agent, process_id\n| summarize input_tokens=sum(input_tokens), output_tokens=sum(output_tokens), total_tokens=sum(total_tokens) by process_id\n| join kind=inner (\n customEvents\n | where name == 'LLM_Token_Usage_Summary'\n | where timestamp > ago(7d)\n | extend process_id = tostring(customDimensions['process_id'])\n | extend mime_type = tostring(customDimensions['file_mime_type'])\n | summarize mime_type=take_any(mime_type) by process_id\n) on process_id\n| extend file_type = case(\n mime_type has \"pdf\", \"PDF\",\n mime_type has \"image\", \"Image\",\n mime_type has \"word\" or mime_type has \"docx\", \"Word\",\n mime_type has \"excel\" or mime_type has \"xlsx\", \"Excel\",\n mime_type has \"text\", \"Text\",\n \"Other\")\n| summarize\n Documents = count(),\n TotalInputTokens = sum(input_tokens),\n TotalOutputTokens = sum(output_tokens),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0)\n by FileType = file_type\n| order by TotalTokens desc", - "size": 0, - "title": "Token Usage by File Type", - "queryType": 0, - "resourceType": 
"microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "tokens-by-filetype" - }, - { - "type": 1, - "content": { - "json": "---\n## Processing Time" - }, - "name": "section-processing-time" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc", - "size": 0, - "title": "Step Completion Time (seconds from doc start)", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "processing-time-by-step" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "dependencies\r\n| where timestamp > ago(7d)\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls 
= count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10", - "size": 0, - "title": "OpenAI API Call Durations", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "openai-call-durations" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc", - "size": 0, - "title": "Per-Document Step Timeline", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "per-document-time-breakdown" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| summarize StartTime = 
min(timestamp), EndTime = max(timestamp) by process_id\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\n| summarize\n DocumentsProcessed = dcount(process_id),\n AvgSeconds = round(avg(TotalSeconds), 2),\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\n MaxSeconds = round(max(TotalSeconds), 2)", - "size": 3, - "title": "Total Document Processing Time (First to Last Step)", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "total-processing-time" - }, - { - "type": 1, - "content": { - "json": "---\n## Percentiles & Trends" - }, - "name": "section-percentiles" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize total_tokens=max(total_tokens) by process_id\n| summarize\n p50 = round(percentile(total_tokens, 50), 0),\n p90 = round(percentile(total_tokens, 90), 0),\n p95 = round(percentile(total_tokens, 95), 0),\n p99 = round(percentile(total_tokens, 99), 0),\n Max = max(total_tokens)", - "size": 3, - "title": "Token Usage Percentiles Per Document", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "gridSettings": { - "formatters": [ - { - "columnMatch": "p50|p90|p95|p99|Max", - "formatter": 1 - } - ] - }, - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "token-percentiles" - }, - { - "type": 3, - "content": { - "version": "KqlItem/1.0", - "query": "let docs = 
customEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| where customDimensions['agent_name'] == 'MapHandler'\n| extend process_id = tostring(customDimensions['process_id'])\n| distinct process_id;\ncustomEvents\n| where name == 'LLM_Agent_Token_Usage'\n| where timestamp > ago(7d)\n| extend process_id = tostring(customDimensions['process_id'])\n| extend agent = tostring(customDimensions['agent_name'])\n| extend total_tokens = toint(customDimensions['total_tokens'])\n| summarize total_tokens=max(total_tokens) by agent, process_id, bin(timestamp, 1d)\n| summarize total_tokens=sum(total_tokens) by process_id, Day=bin(timestamp, 1d)\n| summarize\n DocumentsProcessed = dcountif(process_id, process_id in (docs)),\n TotalTokens = sum(total_tokens),\n AvgTokensPerDoc = round(avg(total_tokens), 0),\n MaxTokensPerDoc = max(total_tokens)\n by Day", - "size": 0, - "title": "Daily Processing Volume with Token Usage", - "queryType": 0, - "resourceType": "microsoft.insights/components", - "visualization": "table", - "crossComponentResources": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "timeContextFromParameter": "TimeRange" - }, - "name": "daily-volume" - } - ], - "isLocked": true, - "defaultResourceIds": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "fallbackResourceIds": [ - "__APP_INSIGHTS_RESOURCE_ID__" - ], - "fromTemplateId": "community-Workbooks/Common/Templates" -} \ No newline at end of file diff --git a/src/ContentProcessor/src/libs/llm_token_telemetry.py b/src/ContentProcessor/src/libs/llm_token_telemetry.py new file mode 100644 index 00000000..7e838b5a --- /dev/null +++ b/src/ContentProcessor/src/libs/llm_token_telemetry.py @@ -0,0 +1,990 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Cross-accelerator LLM token-usage telemetry helpers. 
+ +A single, dependency-light helper module that can be dropped into any Microsoft +Solution Accelerator to capture LLM token usage and emit standardized custom +events to Application Insights. + +Why this file exists +-------------------- +Seven solution accelerators have independently shipped near-identical +``token_usage_utils.py`` modules. They all: + +* extract token counts from agent_framework / Azure OpenAI responses, +* emit the same three custom events (``LLM_Token_Usage_Summary``, + ``LLM_Agent_Token_Usage``, ``LLM_Model_Token_Usage``), +* defensively swallow telemetry errors, +* duplicate the same KQL queries and Azure Workbook. + +This module consolidates the union of those behaviours behind one stable API +so each accelerator can replace its bespoke helper with an import. + +Public API +---------- +- ``TokenUsage`` -- immutable dataclass for counts +- ``extract_usage(obj)`` -- agent_framework run result / message +- ``extract_usage_from_dict(d)`` -- raw dict from any SDK +- ``extract_usage_from_stream_chunk`` -- streaming chunks +- ``extract_realtime_usage(resp)`` -- Azure AI Voice Live response.done +- ``TokenUsageEmitter`` -- emits the three events + optional + per-user / per-team / speech events +- ``TokenUsageScope`` -- context-manager that accumulates and + auto-emits on exit +- ``track_tokens`` -- decorator wrapper around the scope + +Design rules +------------ +* Telemetry NEVER raises. Extraction failures return ``None``; emission + failures are logged at WARNING. +* No hard dependency on ``azure-monitor-events-extension``; if absent the + emitter degrades to logging only. +* Arbitrary correlation dimensions are passed as ``**dimensions`` kwargs and + surface verbatim as custom-event properties. This is how each accelerator + attaches its own keys (``conversation_id``, ``process_id``, ``team_name``, + ``file_name``, ``tenant``, etc.) without forking the helper. 
+""" +from __future__ import annotations + +import asyncio +import functools +import logging +import os +import random +import time +from contextlib import AbstractContextManager +from dataclasses import dataclass, field +from typing import Any, Callable, Iterable, Mapping, Optional +from unittest.mock import NonCallableMock + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Event-name constants -- keep these stable; KQL queries and workbooks bind +# to these exact strings. +# --------------------------------------------------------------------------- +EVENT_SUMMARY = "LLM_Token_Usage_Summary" +EVENT_AGENT = "LLM_Agent_Token_Usage" +EVENT_MODEL = "LLM_Model_Token_Usage" +EVENT_USER = "LLM_User_Token_Usage" +EVENT_TEAM = "LLM_Team_Token_Usage" +EVENT_SPEECH = "Speech_Usage" + + +# Token-count field aliases observed across model providers / SDK versions. +_INPUT_KEYS = ( + "input_token_count", + "input_tokens", + "prompt_tokens", + "promptTokens", +) +_OUTPUT_KEYS = ( + "output_token_count", + "output_tokens", + "completion_tokens", + "completionTokens", +) +_TOTAL_KEYS = ( + "total_token_count", + "total_tokens", + "totalTokens", +) + + +# --------------------------------------------------------------------------- +# Data model +# --------------------------------------------------------------------------- +@dataclass(frozen=True) +class TokenUsage: + """Normalized token-usage record. + + Attributes: + input_tokens: Number of input/prompt tokens consumed. + output_tokens: Number of output/completion tokens consumed. + total_tokens: Total token count (input + output). + input_audio_tokens: Audio input tokens (realtime/voice only). + input_text_tokens: Text input tokens (realtime/voice only). + input_cached_tokens: Cached input tokens (realtime/voice only). + output_audio_tokens: Audio output tokens (realtime/voice only). + output_text_tokens: Text output tokens (realtime/voice only). 
+ """ + + input_tokens: int = 0 + output_tokens: int = 0 + total_tokens: int = 0 + + input_audio_tokens: Optional[int] = None + input_text_tokens: Optional[int] = None + input_cached_tokens: Optional[int] = None + output_audio_tokens: Optional[int] = None + output_text_tokens: Optional[int] = None + + @property + def has_any(self) -> bool: + """True if any token count is non-zero.""" + return bool(self.input_tokens or self.output_tokens or self.total_tokens) + + def __add__(self, other: "TokenUsage") -> "TokenUsage": + if not isinstance(other, TokenUsage): + return NotImplemented + + def _sum(a: Optional[int], b: Optional[int]) -> Optional[int]: + if a is None and b is None: + return None + return (a or 0) + (b or 0) + + return TokenUsage( + input_tokens=self.input_tokens + other.input_tokens, + output_tokens=self.output_tokens + other.output_tokens, + total_tokens=self.total_tokens + other.total_tokens, + input_audio_tokens=_sum(self.input_audio_tokens, other.input_audio_tokens), + input_text_tokens=_sum(self.input_text_tokens, other.input_text_tokens), + input_cached_tokens=_sum(self.input_cached_tokens, other.input_cached_tokens), + output_audio_tokens=_sum(self.output_audio_tokens, other.output_audio_tokens), + output_text_tokens=_sum(self.output_text_tokens, other.output_text_tokens), + ) + + def to_event_props(self) -> dict[str, str]: + """Stringified property bag suitable for App Insights custom events.""" + props: dict[str, str] = { + "input_tokens": str(self.input_tokens), + "output_tokens": str(self.output_tokens), + "total_tokens": str(self.total_tokens), + } + for name in ( + "input_audio_tokens", + "input_text_tokens", + "input_cached_tokens", + "output_audio_tokens", + "output_text_tokens", + ): + value = getattr(self, name) + if value is not None: + props[name] = str(value) + return props + + +# --------------------------------------------------------------------------- +# Low-level coercion helpers +# 
# ---------------------------------------------------------------------------
def _to_int(value: Any, default: int = 0) -> int:
    """Convert *value* to ``int`` on a best-effort basis; never raises.

    Args:
        value: Candidate token count. May be ``None``, a number, a numeric
            string, or any object accepted by ``int()``.
        default: Value returned when *value* cannot be converted.

    Returns:
        ``value`` itself when it is already an ``int`` (``bool`` excluded),
        the truncated value for finite floats, the parsed value for numeric
        strings, and ``default`` for ``None``, ``bool``, NaN/infinite floats
        and anything ``int()`` rejects.
    """
    # bool is an int subclass; True/False are flags, not token counts.
    if value is None or isinstance(value, bool):
        return default
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        value = value.strip()
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric strings, digit-like characters that int()
        # rejects (e.g. superscripts) and float NaN.
        # OverflowError: float +/-infinity.
        # TypeError: objects that do not support int() at all.
        return default


def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Read *key* from *obj* as a mapping key or as an attribute.

    Args:
        obj: A ``Mapping``, an arbitrary object, or ``None``.
        key: Mapping key / attribute name to look up.
        default: Value returned when *obj* is ``None`` or *key* is missing.

    Returns:
        ``obj.get(key, default)`` for mappings, otherwise
        ``getattr(obj, key, default)``.
    """
    if obj is None:
        return default
    if isinstance(obj, Mapping):
        return obj.get(key, default)
    return getattr(obj, key, default)


def _is_iterable(obj: Any) -> bool:
    """Return True when *obj* should be walked as a collection of items.

    ``list``/``tuple``/``set``/``frozenset`` and any other object exposing
    ``__iter__`` (e.g. generators) count as iterable. ``None``, text/byte
    strings, mappings and ``unittest.mock`` mock objects do not.
    """
    if obj is None:
        return False
    if isinstance(obj, (list, tuple, set, frozenset)):
        return True
    if isinstance(obj, (str, bytes, bytearray, Mapping)):
        return False
    # NOTE(review): presumably excluded so that test doubles such as
    # MagicMock (which expose ``__iter__``) are not iterated -- confirm.
    if isinstance(obj, NonCallableMock):
        return False
    return hasattr(obj, "__iter__")


def _first_count(usage_obj: Any, keys: Iterable[str]) -> int:
    """Return the int value of the first truthy alias in *keys*, else 0."""
    for key in keys:
        raw = _get(usage_obj, key)
        if raw:
            # The first truthy alias wins even if it converts to 0.
            return _to_int(raw)
    return 0


def _read_counts(usage_obj: Any) -> Optional[TokenUsage]:
    """Read input/output/total token counts from a usage object or dict.

    Each count is looked up under the aliases in ``_INPUT_KEYS``,
    ``_OUTPUT_KEYS`` and ``_TOTAL_KEYS``; the first truthy alias is used.

    Args:
        usage_obj: Mapping or attribute-bearing object, or ``None``.

    Returns:
        A ``TokenUsage`` when at least one count is non-zero, otherwise
        ``None``. A missing total is derived as ``input + output``.
    """
    if usage_obj is None:
        return None

    inp = _first_count(usage_obj, _INPUT_KEYS)
    out = _first_count(usage_obj, _OUTPUT_KEYS)
    tot = _first_count(usage_obj, _TOTAL_KEYS)

    if tot == 0 and (inp or out):
        tot = inp + out
    if not (inp or out or tot):
        return None
    return TokenUsage(input_tokens=inp, output_tokens=out, total_tokens=tot)


# ---------------------------------------------------------------------------
# Extraction -- public
# ---------------------------------------------------------------------------
+def extract_usage(result: Any) -> Optional[TokenUsage]: + """Extract usage from an agent_framework run result or ChatCompletion. + + Checks (in order): + 1. ``result.usage_details`` or ``result.usage`` + 2. ``result.raw_representation.usage`` (OpenAI ChatCompletion shape) + 3. Aggregated ``result.messages[*].contents[*].usage_details`` + + Never raises -- returns ``None`` on any unexpected shape. + """ + if result is None: + return None + + try: + for attr in ("usage_details", "usage"): + found = _read_counts(_get(result, attr)) + if found: + return found + + raw = _get(result, "raw_representation") + if raw is not None: + found = _read_counts(_get(raw, "usage")) + if found: + return found + + aggregated = TokenUsage() + found_any = False + messages = _get(result, "messages") + if not _is_iterable(messages): + return None + for msg in messages: + contents = _get(msg, "contents") + if not _is_iterable(contents): + continue + for content in contents: + usage = _get(content, "usage_details") or _get(content, "usage") + piece = _read_counts(usage) + if piece: + aggregated = aggregated + piece + found_any = True + return aggregated if found_any else None + except Exception as exc: + logger.debug("extract_usage failed: %s", exc, exc_info=True) + return None + + +def extract_usage_from_dict(data: Any) -> Optional[TokenUsage]: + """Extract from a raw dict / SDK usage object.""" + return _read_counts(data) + + +def extract_usage_from_stream_chunk(chunk: Any) -> Optional[TokenUsage]: + """Streaming chunks: try the top-level shape, then ``chunk.metadata.usage``.""" + found = extract_usage(chunk) + if found: + return found + metadata = _get(chunk, "metadata") + if metadata is not None: + return _read_counts(_get(metadata, "usage")) + return None + + +def extract_realtime_usage(response_obj: Any) -> Optional[TokenUsage]: + """Azure AI Voice Live ``response.done`` payload extractor. + + Includes audio / text / cached sub-counts when present. 
+ """ + usage = _get(response_obj, "usage") + if usage is None: + return None + + inp = _to_int(_get(usage, "input_tokens")) + out = _to_int(_get(usage, "output_tokens")) + tot = _to_int(_get(usage, "total_tokens")) + if tot == 0 and (inp or out): + tot = inp + out + + in_details = _get(usage, "input_token_details") or {} + out_details = _get(usage, "output_token_details") or {} + + record = TokenUsage( + input_tokens=inp, + output_tokens=out, + total_tokens=tot, + input_audio_tokens=_to_int(_get(in_details, "audio_tokens")), + input_text_tokens=_to_int(_get(in_details, "text_tokens")), + input_cached_tokens=_to_int(_get(in_details, "cached_tokens")), + output_audio_tokens=_to_int(_get(out_details, "audio_tokens")), + output_text_tokens=_to_int(_get(out_details, "text_tokens")), + ) + if record.has_any or any( + v for v in ( + record.input_audio_tokens, + record.input_text_tokens, + record.input_cached_tokens, + record.output_audio_tokens, + record.output_text_tokens, + ) + ): + return record + return None + + +# --------------------------------------------------------------------------- +# Tool / sub-agent attribution +# --------------------------------------------------------------------------- +def detect_invoked_tools(result: Any) -> set[str]: + """Return the set of tool/function names invoked in an agent result. + + Used by orchestrators that expose sub-agents via ``.as_tool()`` to attribute + token usage only to the sub-agents that were actually called. Never raises. 
+ """ + invoked: set[str] = set() + try: + messages = _get(result, "messages") + if not _is_iterable(messages): + return invoked + for msg in messages: + contents = _get(msg, "contents") + if not _is_iterable(contents): + continue + for content in contents: + if _get(content, "type") == "function_call": + name = _get(content, "name") + if name: + invoked.add(str(name)) + except Exception as exc: + logger.debug("detect_invoked_tools failed: %s", exc, exc_info=True) + return invoked + + +# --------------------------------------------------------------------------- +# Event sink (optional Application Insights dependency) +# --------------------------------------------------------------------------- +EventSink = Callable[[str, Mapping[str, str]], None] + + +def _default_event_sink() -> Optional[EventSink]: + """Return ``azure.monitor.events.extension.track_event`` if importable, + else ``None``.""" + try: + from azure.monitor.events.extension import track_event # type: ignore + except Exception: # pragma: no cover - optional dep + return None + return track_event + + +# --------------------------------------------------------------------------- +# Emitter +# --------------------------------------------------------------------------- +class TokenUsageEmitter: + """Emit standardized token-usage custom events to Application Insights. + + Responsibilities: + 1. Emit LLM_Agent_Token_Usage, LLM_Model_Token_Usage, and + LLM_Token_Usage_Summary events with consistent property schemas. + 2. Optionally sample high-cardinality events while always emitting + the summary event for accurate per-request totals. + 3. Support per-model pricing for estimated cost calculation. + 4. Hash user_id values for PII/GDPR compliance when configured. + + Attributes: + perf_slow_emit_threshold_ms: Soft threshold (ms) above which a + WARNING is logged for an individual emit call. 
+ """ + + def __init__( + self, + *, + connection_string: Optional[str] = None, + static_dimensions: Optional[Mapping[str, Any]] = None, + event_sink: Optional[EventSink] = None, + pricing: Optional[Mapping[str, tuple[float, float]]] = None, + user_id_hasher: Optional[Callable[[str], str]] = None, + sample_rate: float = 1.0, + logger: Optional[logging.Logger] = None, + ) -> None: + self._cs = connection_string if connection_string is not None else os.getenv( + "APPLICATIONINSIGHTS_CONNECTION_STRING" + ) + self._sink = event_sink if event_sink is not None else _default_event_sink() + self._log = logger or logging.getLogger(__name__) + + self._user_id_hasher = user_id_hasher + + try: + sr = float(sample_rate) + except (TypeError, ValueError): + sr = 1.0 + self._sample_rate = max(0.0, min(1.0, sr)) + + self._pricing: dict[str, tuple[float, float]] = {} + for model, rates in (pricing or {}).items(): + if not model or rates is None: + continue + try: + inp, out = rates + self._pricing[str(model).lower()] = (float(inp), float(out)) + except (TypeError, ValueError): + self._log.warning("Ignoring malformed pricing entry: %s=%r", model, rates) + + raw_static = dict(static_dimensions or {}) + if "user_id" in raw_static: + raw_static["user_id"] = self._apply_user_id_hash(raw_static["user_id"]) + self._static: dict[str, str] = { + k: ("" if v is None else str(v)) for k, v in raw_static.items() + } + + self._perf_total_ns: int = 0 + self._perf_emit_count: int = 0 + self._perf_max_ns: int = 0 + self.perf_slow_emit_threshold_ms: float = 50.0 + + @property + def enabled(self) -> bool: + """True when App Insights connection string and event sink are available.""" + return bool(self._cs) and self._sink is not None + + @property + def sample_rate(self) -> float: + """Current sampling rate for high-cardinality events.""" + return self._sample_rate + + def _apply_user_id_hash(self, value: Any) -> Any: + """Apply the configured user_id_hasher; never raises.""" + if value is None or 
value == "" or self._user_id_hasher is None: + return value + try: + return self._user_id_hasher(str(value)) + except Exception as exc: + self._log.warning("user_id_hasher raised: %s", exc) + return value + + def _should_sample(self) -> bool: + """Sampling decision for high-cardinality events.""" + if self._sample_rate >= 1.0: + return True + if self._sample_rate <= 0.0: + return False + return random.random() < self._sample_rate + + def _cost_props( + self, model_deployment_name: Optional[str], usage: TokenUsage + ) -> dict[str, str]: + """Return ``{'estimated_cost_usd': '...'}`` when pricing is configured.""" + if not self._pricing or not model_deployment_name: + return {} + rate = self._pricing.get(model_deployment_name.lower()) + if not rate: + return {} + inp_rate, out_rate = rate + cost = (usage.input_tokens * inp_rate + usage.output_tokens * out_rate) / 1000.0 + return {"estimated_cost_usd": f"{cost:.6f}"} + + def _summary_cost_props( + self, + primary_model: Optional[str], + additional_agents: Mapping[str, str], + usage: TokenUsage, + ) -> dict[str, str]: + """Best-effort cost for the summary event.""" + if primary_model: + cost = self._cost_props(primary_model, usage) + if cost: + return cost + for m in additional_agents.values(): + cost = self._cost_props(m, usage) + if cost: + return cost + return {} + + def emit(self, event_name: str, **dimensions: Any) -> None: + """Low-level: emit an event with arbitrary properties. 
Never raises.""" + start_ns = time.perf_counter_ns() + try: + props = dict(self._static) + for k, v in dimensions.items(): + if v is None: + continue + if k == "user_id": + v = self._apply_user_id_hash(v) + if v is None or v == "": + continue + props[k] = v if isinstance(v, str) else str(v) + + if not self.enabled: + self._log.debug( + "App Insights not configured -- skipping event %s (%s)", + event_name, props, + ) + return + try: + self._sink(event_name, props) # type: ignore[misc] + except Exception as exc: + self._log.warning("track_event(%s) failed: %s", event_name, exc) + finally: + elapsed_ns = time.perf_counter_ns() - start_ns + self._perf_total_ns += elapsed_ns + self._perf_emit_count += 1 + if elapsed_ns > self._perf_max_ns: + self._perf_max_ns = elapsed_ns + elapsed_ms = elapsed_ns / 1_000_000.0 + if elapsed_ms > self.perf_slow_emit_threshold_ms: + self._log.warning( + "Token telemetry emit slow: event=%s duration_ms=%.3f", + event_name, elapsed_ms, + ) + else: + self._log.debug( + "Token telemetry emit: event=%s duration_ms=%.3f", + event_name, elapsed_ms, + ) + + def perf_stats(self) -> dict[str, float]: + """Return cumulative telemetry-overhead stats. + + Returns: + Dict with keys: emit_count, total_ms, avg_ms, max_ms. 
+ """ + count = self._perf_emit_count + total_ms = self._perf_total_ns / 1_000_000.0 + return { + "emit_count": float(count), + "total_ms": total_ms, + "avg_ms": (total_ms / count) if count else 0.0, + "max_ms": self._perf_max_ns / 1_000_000.0, + } + + def reset_perf_stats(self) -> None: + """Zero the perf counters.""" + self._perf_total_ns = 0 + self._perf_emit_count = 0 + self._perf_max_ns = 0 + + def emit_agent( + self, + *, + agent_name: str, + model_deployment_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-agent token usage event.""" + if not usage.has_any or not self._should_sample(): + return + self.emit( + EVENT_AGENT, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_model( + self, + *, + model_deployment_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-model token usage event.""" + if not usage.has_any or not self._should_sample(): + return + self.emit( + EVENT_MODEL, + model_deployment_name=model_deployment_name, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_user( + self, + *, + user_id: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-user token usage event.""" + if not usage.has_any or not user_id or not self._should_sample(): + return + self.emit( + EVENT_USER, + user_id=user_id, + **usage.to_event_props(), + **dimensions, + ) + + def emit_team( + self, + *, + team_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-team token usage event.""" + if not usage.has_any or not team_name or not self._should_sample(): + return + self.emit( + EVENT_TEAM, + team_name=team_name, + **usage.to_event_props(), + **dimensions, + ) + + def emit_summary( + self, + *, + usage: TokenUsage, + agent_count: int = 1, + model_count: int = 1, + 
primary_model: Optional[str] = None, + additional_agents: Optional[Mapping[str, str]] = None, + **dimensions: Any, + ) -> None: + """Emit the summary event (always fires, ignores sample_rate).""" + if not usage.has_any: + return + props = { + "total_input_tokens": str(usage.input_tokens), + "total_output_tokens": str(usage.output_tokens), + "total_tokens": str(usage.total_tokens), + "agent_count": str(agent_count), + "model_count": str(model_count), + "sample_rate": f"{self._sample_rate:.4f}", + } + for k, v in usage.to_event_props().items(): + props.setdefault(k, v) + props.update(self._summary_cost_props(primary_model, additional_agents or {}, usage)) + self.emit(EVENT_SUMMARY, **props, **dimensions) + + def emit_speech( + self, + *, + model_deployment_name: str, + source: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Voice-Live / realtime speech usage event.""" + if not self._should_sample(): + return + self.emit( + EVENT_SPEECH, + model_deployment_name=model_deployment_name, + source=source, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_all( + self, + *, + agent_name: str, + model_deployment_name: str, + usage: TokenUsage, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, + **dimensions: Any, + ) -> None: + """Emit summary, agent, and one model event per distinct model deployment. + + Args: + agent_name: Name of the primary agent/step. + model_deployment_name: Model deployment used by the primary agent. + usage: Accumulated token usage for this invocation. + additional_agents: Maps sub-agent name -> model deployment name. + emit_user_event: Opt in to per-user events. + emit_team_event: Opt in to per-team events. + **dimensions: Extra properties forwarded to all events. 
+ """ + if not usage.has_any: + return + + agents = {agent_name: model_deployment_name} + if additional_agents: + agents.update({k: v for k, v in additional_agents.items() if k}) + models = {m for m in agents.values() if m} + + batch_start_ns = time.perf_counter_ns() + + self.emit_agent( + agent_name=agent_name, + model_deployment_name=model_deployment_name, + usage=usage, + **dimensions, + ) + for model in models: + self.emit_model( + model_deployment_name=model, + usage=usage, + **dimensions, + ) + if emit_user_event and dimensions.get("user_id"): + self.emit_user( + user_id=str(dimensions["user_id"]), + usage=usage, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + ) + if emit_team_event and dimensions.get("team_name"): + self.emit_team( + team_name=str(dimensions["team_name"]), + usage=usage, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + ) + + batch_overhead_ms = (time.perf_counter_ns() - batch_start_ns) / 1_000_000.0 + self.emit_summary( + usage=usage, + agent_count=len(agents), + model_count=len(models) or 1, + primary_model=model_deployment_name, + additional_agents=additional_agents, + telemetry_overhead_ms=f"{batch_overhead_ms:.3f}", + **dimensions, + ) + + self._log.info( + "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d %s", + agent_name, + model_deployment_name, + usage.input_tokens, + usage.output_tokens, + usage.total_tokens, + " ".join(f"{k}={v}" for k, v in dimensions.items() if v), + ) + + +# --------------------------------------------------------------------------- +# Scope / decorator sugar +# --------------------------------------------------------------------------- +@dataclass +class TokenUsageScope(AbstractContextManager): + """Accumulate usage across multiple results, then emit on exit. 
+ + Example:: + + with TokenUsageScope(emitter, + agent_name="MapHandler", + model_deployment_name=cfg.model, + process_id=pid) as scope: + result = await agent.run(prompt) + scope.add(result) + + Attributes: + emitter: The TokenUsageEmitter instance to use for emission. + agent_name: Name of the agent/step being tracked. + model_deployment_name: Model deployment name for attribution. + dimensions: Extra properties forwarded to all events. + additional_agents: Maps sub-agent name -> model deployment name. + emit_user_event: Whether to emit per-user events. + emit_team_event: Whether to emit per-team events. + usage: Accumulated TokenUsage so far. + """ + + emitter: TokenUsageEmitter + agent_name: str + model_deployment_name: str + dimensions: dict[str, Any] = field(default_factory=dict) + additional_agents: dict[str, str] = field(default_factory=dict) + emit_user_event: bool = False + emit_team_event: bool = False + usage: TokenUsage = field(default_factory=TokenUsage) + + def __init__( + self, + emitter: TokenUsageEmitter, + *, + agent_name: str, + model_deployment_name: str, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, + **dimensions: Any, + ) -> None: + self.emitter = emitter + self.agent_name = agent_name + self.model_deployment_name = model_deployment_name + self.additional_agents = dict(additional_agents or {}) + self.emit_user_event = emit_user_event + self.emit_team_event = emit_team_event + self.dimensions = dict(dimensions) + self.usage = TokenUsage() + self._extract_ns: int = 0 + self._emit_ns: int = 0 + + def add(self, source: Any) -> Optional[TokenUsage]: + """Extract usage from any supported shape and add to the running total. + + Args: + source: Agent run result, ChatMessage, or ChatCompletion object. + + Returns: + The extracted TokenUsage, or None if extraction failed. 
+ """ + start_ns = time.perf_counter_ns() + try: + found = extract_usage(source) or extract_usage_from_stream_chunk(source) + except Exception as exc: + logger.debug("TokenUsageScope.add failed: %s", exc, exc_info=True) + return None + finally: + self._extract_ns += time.perf_counter_ns() - start_ns + if found: + self.usage = self.usage + found + return found + + def add_usage(self, usage: TokenUsage) -> None: + """Add a pre-constructed TokenUsage to the running total.""" + self.usage = self.usage + usage + + def add_chunks(self, chunks: Iterable[Any]) -> None: + """Extract and accumulate usage from a stream of chunks.""" + for c in chunks: + self.add(c) + + @property + def extract_ms(self) -> float: + """Total ms spent inside :meth:`add` / :meth:`add_chunks`.""" + return self._extract_ns / 1_000_000.0 + + @property + def emit_ms(self) -> float: + """Total ms spent in the on-exit emit batch.""" + return self._emit_ns / 1_000_000.0 + + @property + def total_overhead_ms(self) -> float: + """Total telemetry overhead added by this scope (extract + emit).""" + return self.extract_ms + self.emit_ms + + def __exit__(self, exc_type, exc, tb) -> None: + emit_start_ns = time.perf_counter_ns() + try: + self.emitter.emit_all( + agent_name=self.agent_name, + model_deployment_name=self.model_deployment_name, + usage=self.usage, + additional_agents=self.additional_agents, + emit_user_event=self.emit_user_event, + emit_team_event=self.emit_team_event, + **self.dimensions, + ) + except Exception as emit_exc: # pragma: no cover + logger.warning("TokenUsageScope emit failed: %s", emit_exc) + finally: + self._emit_ns += time.perf_counter_ns() - emit_start_ns + logger.debug( + "TokenUsageScope overhead: agent=%s extract_ms=%.3f " + "emit_ms=%.3f total_ms=%.3f", + self.agent_name, + self.extract_ms, + self.emit_ms, + self.total_overhead_ms, + ) + return None + + +def track_tokens( + emitter: TokenUsageEmitter, + *, + agent_name: str, + model_deployment_name: str, + dimension_args: 
Optional[Mapping[str, str]] = None, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, +): + """Decorator: wrap an async or sync function that returns an LLM result. + + Args: + emitter: TokenUsageEmitter to use. + agent_name: Name of the agent/step. + model_deployment_name: Model deployment name. + dimension_args: Maps emitted-property-name -> callable-keyword-argument. + additional_agents: Sub-agent name -> model deployment name mapping. + emit_user_event: Opt in to per-user events. + emit_team_event: Opt in to per-team events. + """ + + dim_args = dict(dimension_args or {}) + + def _decorator(fn: Callable[..., Any]): + is_coro = _is_coroutine_function(fn) + + if is_coro: + @functools.wraps(fn) + async def _aw(*args, **kwargs) -> Any: + with _scope_for(kwargs) as scope: + result = await fn(*args, **kwargs) + scope.add(result) + return result + return _aw + + @functools.wraps(fn) + def _sw(*args, **kwargs) -> Any: + with _scope_for(kwargs) as scope: + result = fn(*args, **kwargs) + scope.add(result) + return result + return _sw + + def _scope_for(call_kwargs: Mapping[str, Any]) -> TokenUsageScope: + dimensions = { + prop: call_kwargs.get(kw) + for prop, kw in dim_args.items() + if call_kwargs.get(kw) is not None + } + return TokenUsageScope( + emitter, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + additional_agents=additional_agents, + emit_user_event=emit_user_event, + emit_team_event=emit_team_event, + **dimensions, + ) + + return _decorator + + +def _is_coroutine_function(fn: Callable[..., Any]) -> bool: + return asyncio.iscoroutinefunction(fn) + + +__all__ = [ + "EVENT_SUMMARY", + "EVENT_AGENT", + "EVENT_MODEL", + "EVENT_USER", + "EVENT_TEAM", + "EVENT_SPEECH", + "TokenUsage", + "TokenUsageEmitter", + "TokenUsageScope", + "track_tokens", + "extract_usage", + "extract_usage_from_dict", + "extract_usage_from_stream_chunk", + "extract_realtime_usage", + 
"detect_invoked_tools", +] diff --git a/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py b/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py index 4eb964fe..3b31e7c8 100644 --- a/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py +++ b/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py @@ -28,7 +28,8 @@ from libs.pipeline.entities.pipeline_step_result import StepResult from libs.pipeline.entities.schema import Schema from libs.pipeline.queue_handler_base import HandlerBase -from libs.token_usage_utils import emit_agent_token_event, extract_token_usage +from libs.llm_token_telemetry import TokenUsageScope +from libs.telemetry import token_emitter from libs.utils.remote_schema_loader import load_schema_from_blob_json logger = logging.getLogger(__name__) @@ -265,13 +266,13 @@ async def execute(self, context: MessageContext) -> StepResult: ) # Track token usage for this LLM call - token_usage = extract_token_usage(gpt_response) - emit_agent_token_event( + with TokenUsageScope( + token_emitter, agent_name="MapHandler", model_deployment_name=self.application_context.configuration.app_azure_openai_model, - usage=token_usage, process_id=context.data_pipeline.pipeline_status.process_id, - ) + ) as scope: + scope.add(gpt_response) response_content = gpt_response.text # Json format string diff --git a/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py b/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py index ed624069..15c90f56 100644 --- a/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py +++ b/src/ContentProcessor/src/libs/pipeline/handlers/save_handler.py @@ -20,7 +20,6 @@ from libs.pipeline.entities.schema import Schema from libs.pipeline.handlers.logics.evaluate_handler.model import DataExtractionResult from libs.pipeline.queue_handler_base import HandlerBase -from libs.token_usage_utils import emit_model_token_event, emit_summary_token_event class SaveHandler(HandlerBase): @@ 
-169,30 +168,6 @@ def find_process_result(step_name: str): collection_name=self.application_context.configuration.app_cosmos_container_process, ) - # Emit token usage summary and per-model events to Application Insights - # NOTE: This summary only contains tokens from the evaluate/map step. - # For true totals across all pipeline steps (Summarize, RAI, GapAnalysis), - # aggregate from LLM_Agent_Token_Usage events grouped by process_id. - emit_summary_token_event( - total_input_tokens=evaluated_result.prompt_tokens, - total_output_tokens=evaluated_result.completion_tokens, - total_tokens=evaluated_result.prompt_tokens + evaluated_result.completion_tokens, - process_id=context.data_pipeline.pipeline_status.process_id, - file_name=context.data_pipeline.get_source_files()[0].name, - file_mime_type=context.data_pipeline.get_source_files()[0].mime_type, - agent_count=1, - model_count=1, - ) - emit_model_token_event( - model_deployment_name=self.application_context.configuration.app_azure_openai_model, - usage={ - "input_tokens": evaluated_result.prompt_tokens, - "output_tokens": evaluated_result.completion_tokens, - "total_tokens": evaluated_result.prompt_tokens + evaluated_result.completion_tokens, - }, - process_id=context.data_pipeline.pipeline_status.process_id, - ) - # save process_output to blob storage. processed_history = context.data_pipeline.add_file( file_name="step_outputs.json", artifact_type=ArtifactType.SavedContent diff --git a/src/ContentProcessor/src/libs/telemetry.py b/src/ContentProcessor/src/libs/telemetry.py new file mode 100644 index 00000000..0e432ed1 --- /dev/null +++ b/src/ContentProcessor/src/libs/telemetry.py @@ -0,0 +1,92 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Process-wide telemetry singletons for the content-processing pipeline. 
+ +A single :class:`TokenUsageEmitter` is constructed at import time so every +handler/executor shares the same App Insights connection-string resolution and +static dimensions. Importing this module has no side effects beyond reading +``APPLICATIONINSIGHTS_CONNECTION_STRING`` and the env vars documented below. + +Optional environment variables +------------------------------ +LLM_TOKEN_SAMPLE_RATE + Float in [0, 1]. Fraction of high-cardinality token events + (agent/model/user/team/speech) to ship. The summary event always fires. + Defaults to ``1.0``. + +LLM_TOKEN_USER_ID_HMAC_KEY + When set, ``user_id`` values are replaced with an HMAC-SHA256 hex digest + (truncated to 16 chars) before leaving the process. Use to satisfy + GDPR / PII handling requirements without modifying call sites. + +LLM_TOKEN_PRICING + Optional comma-separated list of ``model=in_per_1k:out_per_1k`` entries, + e.g. ``gpt-4o=0.0025:0.01,gpt-4o-mini=0.00015:0.0006``. When set the + emitter attaches ``estimated_cost_usd`` to agent / model / summary + events so dashboards can group by cost without hard-coded KQL rates. 
+""" +from __future__ import annotations + +import hashlib +import hmac +import logging +import os +from typing import Callable, Optional + +from libs.llm_token_telemetry import TokenUsageEmitter + +_log = logging.getLogger(__name__) + + +def _parse_sample_rate() -> float: + raw = os.getenv("LLM_TOKEN_SAMPLE_RATE") + if not raw: + return 1.0 + try: + return max(0.0, min(1.0, float(raw))) + except ValueError: + _log.warning("Invalid LLM_TOKEN_SAMPLE_RATE=%r; defaulting to 1.0", raw) + return 1.0 + + +def _build_user_id_hasher() -> Optional[Callable[[str], str]]: + key = os.getenv("LLM_TOKEN_USER_ID_HMAC_KEY") + if not key: + return None + key_bytes = key.encode("utf-8") + + def _hash(value: str) -> str: + digest = hmac.new(key_bytes, value.encode("utf-8"), hashlib.sha256).hexdigest() + return digest[:16] + + return _hash + + +def _parse_pricing() -> dict[str, tuple[float, float]]: + raw = os.getenv("LLM_TOKEN_PRICING") + if not raw: + return {} + pricing: dict[str, tuple[float, float]] = {} + for entry in raw.split(","): + entry = entry.strip() + if not entry or "=" not in entry: + continue + model, rates = entry.split("=", 1) + if ":" not in rates: + continue + in_s, out_s = rates.split(":", 1) + try: + pricing[model.strip().lower()] = (float(in_s), float(out_s)) + except ValueError: + _log.warning("Ignoring malformed pricing entry: %s", entry) + return pricing + + +token_emitter = TokenUsageEmitter( + static_dimensions={"app": "content-processing"}, + sample_rate=_parse_sample_rate(), + user_id_hasher=_build_user_id_hasher(), + pricing=_parse_pricing(), +) + +__all__ = ["token_emitter"] diff --git a/src/ContentProcessor/src/libs/token_usage_utils.py b/src/ContentProcessor/src/libs/token_usage_utils.py deleted file mode 100644 index b88c5cd5..00000000 --- a/src/ContentProcessor/src/libs/token_usage_utils.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -"""Token usage tracking for LLM calls in the content processing pipeline. - -Extracts token counts from Azure OpenAI agent framework responses and emits -custom events to Application Insights for monitoring, cost estimation, and -performance optimization. -""" - -import logging -import os -from typing import Any - -logger = logging.getLogger(__name__) - - -def _track_event_if_configured(event_name: str, event_data: dict) -> None: - """Track a custom event to Application Insights if configured. - - Args: - event_name: Name of the custom event. - event_data: Dictionary of event properties (all values must be strings). - """ - connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") - if connection_string: - try: - from azure.monitor.events.extension import track_event - - track_event(event_name, event_data) - except Exception as exc: - logger.warning("Failed to track event '%s': %s", event_name, exc) - else: - logger.debug( - "Skipping track_event for %s: Application Insights is not configured", - event_name, - ) - - -def extract_token_usage(response: Any) -> dict[str, int]: - """Extract token usage from an agent framework ChatMessage response. - - Checks multiple attribute paths to handle different response shapes - from the agent framework SDK. - - Args: - response: The ChatMessage response object from agent.run(). - - Returns: - Dict with keys: input_tokens, output_tokens, total_tokens. - All default to 0 if not found. 
- """ - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - - # Path 1: usage_details attribute (set by agent framework SDK) - usage_details = getattr(response, "usage_details", None) - if usage_details is not None: - if isinstance(usage_details, dict): - input_tokens = _to_int( - usage_details.get("input_token_count") - or usage_details.get("prompt_tokens") - or usage_details.get("input_tokens") - ) - output_tokens = _to_int( - usage_details.get("output_token_count") - or usage_details.get("completion_tokens") - or usage_details.get("output_tokens") - ) - total_tokens = _to_int( - usage_details.get("total_token_count") - or usage_details.get("total_tokens") - ) or (input_tokens + output_tokens) - else: - # UsageDetails object with attributes - input_tokens = _to_int( - getattr(usage_details, "input_token_count", 0) - or getattr(usage_details, "prompt_tokens", 0) - ) - output_tokens = _to_int( - getattr(usage_details, "output_token_count", 0) - or getattr(usage_details, "completion_tokens", 0) - ) - total_tokens = _to_int( - getattr(usage_details, "total_token_count", 0) - ) or (input_tokens + output_tokens) - - # Path 2: raw_representation.usage (raw Azure OpenAI response) - if total_tokens == 0: - raw = getattr(response, "raw_representation", None) - if raw is not None: - usage_obj = getattr(raw, "usage", None) - if usage_obj is not None: - if isinstance(usage_obj, dict): - input_tokens = _to_int( - usage_obj.get("prompt_tokens") - or usage_obj.get("input_tokens") - ) - output_tokens = _to_int( - usage_obj.get("completion_tokens") - or usage_obj.get("output_tokens") - ) - total_tokens = _to_int( - usage_obj.get("total_tokens") - ) or (input_tokens + output_tokens) - else: - input_tokens = _to_int( - getattr(usage_obj, "prompt_tokens", 0) - or getattr(usage_obj, "input_tokens", 0) - ) - output_tokens = _to_int( - getattr(usage_obj, "completion_tokens", 0) - or getattr(usage_obj, "output_tokens", 0) - ) - total_tokens = _to_int( - getattr(usage_obj, 
"total_tokens", 0) - ) or (input_tokens + output_tokens) - - return { - "input_tokens": input_tokens, - "output_tokens": output_tokens, - "total_tokens": total_tokens, - } - - -def emit_agent_token_event( - agent_name: str, - model_deployment_name: str, - usage: dict[str, int], - process_id: str = "", -) -> None: - """Emit a per-agent token usage event to Application Insights. - - Args: - agent_name: Name of the pipeline step/agent (e.g. 'MapHandler', 'RAI'). - model_deployment_name: Azure OpenAI model deployment name. - usage: Dict with input_tokens, output_tokens, total_tokens. - process_id: Document processing ID for correlation. - """ - _track_event_if_configured("LLM_Agent_Token_Usage", { - "agent_name": agent_name, - "input_tokens": str(usage.get("input_tokens", 0)), - "output_tokens": str(usage.get("output_tokens", 0)), - "total_tokens": str(usage.get("total_tokens", 0)), - "model_deployment_name": model_deployment_name, - "process_id": process_id, - }) - logger.info( - "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d process=%s", - agent_name, - model_deployment_name, - usage.get("input_tokens", 0), - usage.get("output_tokens", 0), - usage.get("total_tokens", 0), - process_id, - ) - - -def emit_model_token_event( - model_deployment_name: str, - usage: dict[str, int], - process_id: str = "", -) -> None: - """Emit a per-model token usage event to Application Insights. - - Args: - model_deployment_name: Azure OpenAI model deployment name. - usage: Dict with input_tokens, output_tokens, total_tokens. - process_id: Document processing ID for correlation. 
- """ - _track_event_if_configured("LLM_Model_Token_Usage", { - "model_deployment_name": model_deployment_name, - "input_tokens": str(usage.get("input_tokens", 0)), - "output_tokens": str(usage.get("output_tokens", 0)), - "total_tokens": str(usage.get("total_tokens", 0)), - "process_id": process_id, - }) - - -def emit_summary_token_event( - total_input_tokens: int, - total_output_tokens: int, - total_tokens: int, - process_id: str = "", - file_name: str = "", - file_mime_type: str = "", - agent_count: int = 0, - model_count: int = 0, -) -> None: - """Emit a summary token usage event for a complete document processing run. - - Args: - total_input_tokens: Sum of all input tokens across all steps. - total_output_tokens: Sum of all output tokens across all steps. - total_tokens: Sum of all tokens across all steps. - process_id: Document processing ID. - file_name: Name of the processed file. - file_mime_type: MIME type of the processed file. - agent_count: Number of agents/steps that used tokens. - model_count: Number of distinct models used. - """ - _track_event_if_configured("LLM_Token_Usage_Summary", { - "total_input_tokens": str(total_input_tokens), - "total_output_tokens": str(total_output_tokens), - "total_tokens": str(total_tokens), - "process_id": process_id, - "file_name": file_name, - "file_mime_type": file_mime_type, - "agent_count": str(agent_count), - "model_count": str(model_count), - }) - logger.info( - "[TOKEN SUMMARY] process=%s file=%s input=%d output=%d total=%d agents=%d models=%d", - process_id, - file_name, - total_input_tokens, - total_output_tokens, - total_tokens, - agent_count, - model_count, - ) - - -def _to_int(val: object, default: int = 0) -> int: - """Safely convert a value to int. - - Args: - val: Value to convert. - default: Default if conversion fails. - - Returns: - Integer value or default. 
- """ - if val is None or isinstance(val, bool): - return default - if isinstance(val, int): - return val - if isinstance(val, float): - return int(val) - if isinstance(val, str): - s = val.strip() - if s.isdigit(): - return int(s) - return default diff --git a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py index 0454e411..5e9c1b83 100644 --- a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py +++ b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py @@ -1,18 +1,20 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -"""Tests for libs.token_usage_utils (token usage extraction and event emission).""" +"""Tests for libs.llm_token_telemetry (standardized token usage telemetry).""" from __future__ import annotations from unittest.mock import MagicMock, patch -from libs.token_usage_utils import ( +from libs.llm_token_telemetry import ( + TokenUsage, + TokenUsageEmitter, + TokenUsageScope, _to_int, - emit_agent_token_event, - emit_model_token_event, - emit_summary_token_event, - extract_token_usage, + extract_usage, + extract_usage_from_dict, + detect_invoked_tools, ) @@ -45,10 +47,46 @@ def test_custom_default(self): assert _to_int(None, default=5) == 5 -# ── extract_token_usage ──────────────────────────────────────────────── +# ── TokenUsage dataclass ────────────────────────────────────────────── -class TestExtractTokenUsage: +class TestTokenUsage: + """Immutable token-usage record with addition support.""" + + def test_defaults_to_zero(self): + usage = TokenUsage() + assert usage.input_tokens == 0 + assert usage.output_tokens == 0 + assert usage.total_tokens == 0 + assert not usage.has_any + + def test_has_any_true_when_nonzero(self): + assert TokenUsage(input_tokens=1).has_any + assert TokenUsage(output_tokens=1).has_any + assert TokenUsage(total_tokens=1).has_any + + def test_addition(self): + a = TokenUsage(input_tokens=100, 
output_tokens=50, total_tokens=150) + b = TokenUsage(input_tokens=200, output_tokens=80, total_tokens=280) + result = a + b + assert result.input_tokens == 300 + assert result.output_tokens == 130 + assert result.total_tokens == 430 + + def test_to_event_props(self): + usage = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15) + props = usage.to_event_props() + assert props == { + "input_tokens": "10", + "output_tokens": "5", + "total_tokens": "15", + } + + +# ── extract_usage ────────────────────────────────────────────────────── + + +class TestExtractUsage: """Token extraction from various response shapes.""" def test_usage_details_dict_with_standard_keys(self): @@ -58,12 +96,8 @@ def test_usage_details_dict_with_standard_keys(self): "output_token_count": 50, "total_token_count": 150, } - result = extract_token_usage(response) - assert result == { - "input_tokens": 100, - "output_tokens": 50, - "total_tokens": 150, - } + result = extract_usage(response) + assert result == TokenUsage(input_tokens=100, output_tokens=50, total_tokens=150) def test_usage_details_dict_with_openai_keys(self): response = MagicMock() @@ -72,44 +106,40 @@ def test_usage_details_dict_with_openai_keys(self): "completion_tokens": 80, "total_tokens": 280, } - result = extract_token_usage(response) - assert result == { - "input_tokens": 200, - "output_tokens": 80, - "total_tokens": 280, - } + result = extract_usage(response) + assert result == TokenUsage(input_tokens=200, output_tokens=80, total_tokens=280) def test_usage_details_none_falls_to_raw_representation(self): response = MagicMock() response.usage_details = None + response.usage = None usage_obj = MagicMock() usage_obj.prompt_tokens = 300 usage_obj.completion_tokens = 120 usage_obj.total_tokens = 420 usage_obj.input_tokens = 0 usage_obj.output_tokens = 0 + usage_obj.input_token_count = 0 + usage_obj.output_token_count = 0 + usage_obj.total_token_count = 0 + usage_obj.promptTokens = 0 + usage_obj.completionTokens = 0 + 
usage_obj.totalTokens = 0 response.raw_representation.usage = usage_obj - result = extract_token_usage(response) - assert result == { - "input_tokens": 300, - "output_tokens": 120, - "total_tokens": 420, - } + result = extract_usage(response) + assert result == TokenUsage(input_tokens=300, output_tokens=120, total_tokens=420) def test_raw_representation_dict_usage(self): response = MagicMock() response.usage_details = None + response.usage = None response.raw_representation.usage = { "prompt_tokens": 50, "completion_tokens": 25, "total_tokens": 75, } - result = extract_token_usage(response) - assert result == { - "input_tokens": 50, - "output_tokens": 25, - "total_tokens": 75, - } + result = extract_usage(response) + assert result == TokenUsage(input_tokens=50, output_tokens=25, total_tokens=75) def test_usage_details_object_with_attributes(self): """Handle UsageDetails object (not dict) from agent framework.""" @@ -119,23 +149,20 @@ def test_usage_details_object_with_attributes(self): usage_obj.output_token_count = 150 usage_obj.total_token_count = 550 response.usage_details = usage_obj - result = extract_token_usage(response) - assert result == { - "input_tokens": 400, - "output_tokens": 150, - "total_tokens": 550, - } + result = extract_usage(response) + assert result == TokenUsage(input_tokens=400, output_tokens=150, total_tokens=550) + + def test_none_returns_none(self): + assert extract_usage(None) is None - def test_no_usage_returns_zeros(self): + def test_no_usage_returns_none(self): response = MagicMock() response.usage_details = None + response.usage = None response.raw_representation = None - result = extract_token_usage(response) - assert result == { - "input_tokens": 0, - "output_tokens": 0, - "total_tokens": 0, - } + response.messages = None + result = extract_usage(response) + assert result is None def test_total_computed_from_input_output_when_missing(self): response = MagicMock() @@ -143,104 +170,203 @@ def 
test_total_computed_from_input_output_when_missing(self): "input_token_count": 100, "output_token_count": 50, } - result = extract_token_usage(response) - assert result["total_tokens"] == 150 + result = extract_usage(response) + assert result.total_tokens == 150 -# ── emit_agent_token_event ───────────────────────────────────────────── +# ── extract_usage_from_dict ─────────────────────────────────────────── -class TestEmitAgentTokenEvent: - """Custom event emission for per-agent token usage.""" +class TestExtractUsageFromDict: + """Extraction from raw dict / SDK usage objects.""" - @patch("libs.token_usage_utils._track_event_if_configured") - def test_emits_correct_event(self, mock_track): - usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} - emit_agent_token_event( - agent_name="MapHandler", - model_deployment_name="gpt-4o", - usage=usage, - process_id="proc-123", - ) - mock_track.assert_called_once_with("LLM_Agent_Token_Usage", { - "agent_name": "MapHandler", - "input_tokens": "100", - "output_tokens": "50", - "total_tokens": "150", - "model_deployment_name": "gpt-4o", - "process_id": "proc-123", + def test_dict_with_standard_keys(self): + result = extract_usage_from_dict({ + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, }) + assert result == TokenUsage(input_tokens=100, output_tokens=50, total_tokens=150) + def test_none_returns_none(self): + assert extract_usage_from_dict(None) is None -# ── emit_model_token_event ───────────────────────────────────────────── +# ── detect_invoked_tools ────────────────────────────────────────────── -class TestEmitModelTokenEvent: - """Custom event emission for per-model token usage.""" - @patch("libs.token_usage_utils._track_event_if_configured") - def test_emits_correct_event(self, mock_track): - usage = {"input_tokens": 200, "output_tokens": 80, "total_tokens": 280} - emit_model_token_event( - model_deployment_name="gpt-4o", - usage=usage, - process_id="proc-456", - ) - 
mock_track.assert_called_once_with("LLM_Model_Token_Usage", { - "model_deployment_name": "gpt-4o", - "input_tokens": "200", - "output_tokens": "80", - "total_tokens": "280", - "process_id": "proc-456", - }) +class TestDetectInvokedTools: + """Tool detection from agent result messages.""" + + def test_detects_function_calls(self): + content1 = MagicMock() + content1.type = "function_call" + content1.name = "product_agent" + content2 = MagicMock() + content2.type = "text" + content2.name = None + msg = MagicMock() + msg.contents = [content1, content2] + result_obj = MagicMock() + result_obj.messages = [msg] + invoked = detect_invoked_tools(result_obj) + assert invoked == {"product_agent"} + def test_returns_empty_for_none(self): + assert detect_invoked_tools(None) == set() -# ── emit_summary_token_event ────────────────────────────────────────── +# ── TokenUsageEmitter ───────────────────────────────────────────────── -class TestEmitSummaryTokenEvent: - """Custom event emission for document-level token summary.""" - @patch("libs.token_usage_utils._track_event_if_configured") - def test_emits_correct_event(self, mock_track): - emit_summary_token_event( - total_input_tokens=500, - total_output_tokens=200, - total_tokens=700, - process_id="proc-789", - file_name="test.pdf", - file_mime_type="application/pdf", - agent_count=2, - model_count=1, +class TestTokenUsageEmitter: + """Custom event emission via the standardized emitter.""" + + def test_emit_agent_calls_sink(self): + sink = MagicMock() + emitter = TokenUsageEmitter( + connection_string="test", + event_sink=sink, + static_dimensions={"app": "content-processing"}, ) - mock_track.assert_called_once_with("LLM_Token_Usage_Summary", { - "total_input_tokens": "500", - "total_output_tokens": "200", - "total_tokens": "700", - "process_id": "proc-789", - "file_name": "test.pdf", - "file_mime_type": "application/pdf", - "agent_count": "2", - "model_count": "1", - }) + usage = TokenUsage(input_tokens=100, output_tokens=50, 
total_tokens=150) + emitter.emit_agent( + agent_name="MapHandler", + model_deployment_name="gpt-4o", + usage=usage, + process_id="proc-123", + ) + sink.assert_called_once() + call_args = sink.call_args + assert call_args[0][0] == "LLM_Agent_Token_Usage" + props = call_args[0][1] + assert props["agent_name"] == "MapHandler" + assert props["input_tokens"] == "100" + assert props["app"] == "content-processing" + + def test_emit_all_emits_agent_model_summary(self): + sink = MagicMock() + emitter = TokenUsageEmitter( + connection_string="test", + event_sink=sink, + static_dimensions={"app": "content-processing"}, + ) + usage = TokenUsage(input_tokens=200, output_tokens=80, total_tokens=280) + emitter.emit_all( + agent_name="RAI", + model_deployment_name="gpt-4o", + usage=usage, + process_id="proc-456", + ) + event_names = [call[0][0] for call in sink.call_args_list] + assert "LLM_Agent_Token_Usage" in event_names + assert "LLM_Model_Token_Usage" in event_names + assert "LLM_Token_Usage_Summary" in event_names + + def test_emit_all_agent_count_correct(self): + sink = MagicMock() + emitter = TokenUsageEmitter( + connection_string="test", + event_sink=sink, + ) + usage = TokenUsage(input_tokens=100, output_tokens=50, total_tokens=150) + emitter.emit_all( + agent_name="MapHandler", + model_deployment_name="gpt-4o", + usage=usage, + ) + # Find the summary event call + summary_call = next( + call for call in sink.call_args_list + if call[0][0] == "LLM_Token_Usage_Summary" + ) + props = summary_call[0][1] + assert props["agent_count"] == "1" + assert props["model_count"] == "1" + + def test_emit_skips_when_not_configured(self): + emitter = TokenUsageEmitter(connection_string=None, event_sink=None) + assert not emitter.enabled + # Should not raise + emitter.emit("test_event", key="value") + def test_perf_stats(self): + sink = MagicMock() + emitter = TokenUsageEmitter(connection_string="test", event_sink=sink) + emitter.emit("test_event") + stats = emitter.perf_stats() + assert 
stats["emit_count"] == 1.0 + assert stats["total_ms"] >= 0 -# ── _track_event_if_configured ──────────────────────────────────────── +# ── TokenUsageScope ────────────────────────────────────────────────── -class TestTrackEventIfConfigured: - """Application Insights event tracking guard.""" - @patch.dict("os.environ", {"APPLICATIONINSIGHTS_CONNECTION_STRING": "InstrumentationKey=test"}) - @patch("azure.monitor.events.extension.track_event") - def test_tracks_when_configured(self, mock_track_event): - from libs.token_usage_utils import _track_event_if_configured +class TestTokenUsageScope: + """Context manager that accumulates usage and emits on exit.""" - _track_event_if_configured("test_event", {"key": "value"}) - mock_track_event.assert_called_once_with("test_event", {"key": "value"}) + def test_scope_emits_on_exit(self): + sink = MagicMock() + emitter = TokenUsageEmitter( + connection_string="test", + event_sink=sink, + static_dimensions={"app": "content-processing"}, + ) + response = MagicMock() + response.usage_details = { + "input_token_count": 100, + "output_token_count": 50, + "total_token_count": 150, + } + with TokenUsageScope( + emitter, + agent_name="MapHandler", + model_deployment_name="gpt-4o", + process_id="proc-123", + ) as scope: + scope.add(response) + + assert scope.usage.input_tokens == 100 + assert scope.usage.output_tokens == 50 + event_names = [call[0][0] for call in sink.call_args_list] + assert "LLM_Agent_Token_Usage" in event_names + assert "LLM_Token_Usage_Summary" in event_names + + def test_scope_handles_no_usage(self): + sink = MagicMock() + emitter = TokenUsageEmitter(connection_string="test", event_sink=sink) + response = MagicMock() + response.usage_details = None + response.usage = None + response.raw_representation = None + response.messages = None + with TokenUsageScope( + emitter, + agent_name="Test", + model_deployment_name="gpt-4o", + ) as scope: + scope.add(response) + + assert not scope.usage.has_any + # No events should 
fire for zero usage + sink.assert_not_called() + + def test_scope_accumulates_multiple_adds(self): + sink = MagicMock() + emitter = TokenUsageEmitter(connection_string="test", event_sink=sink) + r1 = MagicMock() + r1.usage_details = {"input_token_count": 100, "output_token_count": 50, "total_token_count": 150} + r2 = MagicMock() + r2.usage_details = {"input_token_count": 200, "output_token_count": 80, "total_token_count": 280} + with TokenUsageScope( + emitter, + agent_name="Test", + model_deployment_name="gpt-4o", + ) as scope: + scope.add(r1) + scope.add(r2) - @patch.dict("os.environ", {}, clear=True) - def test_skips_when_not_configured(self): - from libs.token_usage_utils import _track_event_if_configured + assert scope.usage.input_tokens == 300 + assert scope.usage.output_tokens == 130 + assert scope.usage.total_tokens == 430 - _track_event_if_configured("test_event", {"key": "value"}) diff --git a/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py b/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py new file mode 100644 index 00000000..7e838b5a --- /dev/null +++ b/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py @@ -0,0 +1,990 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Cross-accelerator LLM token-usage telemetry helpers. + +A single, dependency-light helper module that can be dropped into any Microsoft +Solution Accelerator to capture LLM token usage and emit standardized custom +events to Application Insights. + +Why this file exists +-------------------- +Seven solution accelerators have independently shipped near-identical +``token_usage_utils.py`` modules. They all: + +* extract token counts from agent_framework / Azure OpenAI responses, +* emit the same three custom events (``LLM_Token_Usage_Summary``, + ``LLM_Agent_Token_Usage``, ``LLM_Model_Token_Usage``), +* defensively swallow telemetry errors, +* duplicate the same KQL queries and Azure Workbook. 
+ +This module consolidates the union of those behaviours behind one stable API +so each accelerator can replace its bespoke helper with an import. + +Public API +---------- +- ``TokenUsage`` -- immutable dataclass for counts +- ``extract_usage(obj)`` -- agent_framework run result / message +- ``extract_usage_from_dict(d)`` -- raw dict from any SDK +- ``extract_usage_from_stream_chunk`` -- streaming chunks +- ``extract_realtime_usage(resp)`` -- Azure AI Voice Live response.done +- ``TokenUsageEmitter`` -- emits the three events + optional + per-user / per-team / speech events +- ``TokenUsageScope`` -- context-manager that accumulates and + auto-emits on exit +- ``track_tokens`` -- decorator wrapper around the scope + +Design rules +------------ +* Telemetry NEVER raises. Extraction failures return ``None``; emission + failures are logged at WARNING. +* No hard dependency on ``azure-monitor-events-extension``; if absent the + emitter degrades to logging only. +* Arbitrary correlation dimensions are passed as ``**dimensions`` kwargs and + surface verbatim as custom-event properties. This is how each accelerator + attaches its own keys (``conversation_id``, ``process_id``, ``team_name``, + ``file_name``, ``tenant``, etc.) without forking the helper. +""" +from __future__ import annotations + +import asyncio +import functools +import logging +import os +import random +import time +from contextlib import AbstractContextManager +from dataclasses import dataclass, field +from typing import Any, Callable, Iterable, Mapping, Optional +from unittest.mock import NonCallableMock + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Event-name constants -- keep these stable; KQL queries and workbooks bind +# to these exact strings. 
# ---------------------------------------------------------------------------
EVENT_SUMMARY = "LLM_Token_Usage_Summary"
EVENT_AGENT = "LLM_Agent_Token_Usage"
EVENT_MODEL = "LLM_Model_Token_Usage"
EVENT_USER = "LLM_User_Token_Usage"
EVENT_TEAM = "LLM_Team_Token_Usage"
EVENT_SPEECH = "Speech_Usage"


# Token-count field aliases observed across model providers / SDK versions.
# Each tuple is scanned in order; the first alias carrying a truthy value wins.
_INPUT_KEYS = (
    "input_token_count",
    "input_tokens",
    "prompt_tokens",
    "promptTokens",
)
_OUTPUT_KEYS = (
    "output_token_count",
    "output_tokens",
    "completion_tokens",
    "completionTokens",
)
_TOTAL_KEYS = (
    "total_token_count",
    "total_tokens",
    "totalTokens",
)


# ---------------------------------------------------------------------------
# Data model
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class TokenUsage:
    """Normalized token-usage record.

    Instances are immutable; combine them with ``+`` or ``sum()``.

    Attributes:
        input_tokens: Number of input/prompt tokens consumed.
        output_tokens: Number of output/completion tokens consumed.
        total_tokens: Total token count (input + output).
        input_audio_tokens: Audio input tokens (realtime/voice only).
        input_text_tokens: Text input tokens (realtime/voice only).
        input_cached_tokens: Cached input tokens (realtime/voice only).
        output_audio_tokens: Audio output tokens (realtime/voice only).
        output_text_tokens: Text output tokens (realtime/voice only).
    """

    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0

    # Optional sub-counts: ``None`` means "not reported", which is distinct
    # from a reported zero.
    input_audio_tokens: Optional[int] = None
    input_text_tokens: Optional[int] = None
    input_cached_tokens: Optional[int] = None
    output_audio_tokens: Optional[int] = None
    output_text_tokens: Optional[int] = None

    @property
    def has_any(self) -> bool:
        """True if any of the three primary token counts is non-zero."""
        return bool(self.input_tokens or self.output_tokens or self.total_tokens)

    def __add__(self, other: "TokenUsage") -> "TokenUsage":
        """Return the field-wise sum of two usage records.

        An optional sub-count stays ``None`` only when it is ``None`` on both
        operands; otherwise a missing side counts as zero.
        """
        if not isinstance(other, TokenUsage):
            return NotImplemented

        def _sum(a: Optional[int], b: Optional[int]) -> Optional[int]:
            if a is None and b is None:
                return None
            return (a or 0) + (b or 0)

        return TokenUsage(
            input_tokens=self.input_tokens + other.input_tokens,
            output_tokens=self.output_tokens + other.output_tokens,
            total_tokens=self.total_tokens + other.total_tokens,
            input_audio_tokens=_sum(self.input_audio_tokens, other.input_audio_tokens),
            input_text_tokens=_sum(self.input_text_tokens, other.input_text_tokens),
            input_cached_tokens=_sum(self.input_cached_tokens, other.input_cached_tokens),
            output_audio_tokens=_sum(self.output_audio_tokens, other.output_audio_tokens),
            output_text_tokens=_sum(self.output_text_tokens, other.output_text_tokens),
        )

    def __radd__(self, other: Any) -> "TokenUsage":
        """Support ``sum(usages)``, whose implicit start value is the int ``0``.

        Only a literal integer zero is accepted as the left operand; anything
        else still results in ``TypeError`` from the ``+`` operator.
        """
        if isinstance(other, int) and not isinstance(other, bool) and other == 0:
            return self
        return NotImplemented

    def to_event_props(self) -> dict[str, str]:
        """Stringified property bag suitable for App Insights custom events.

        The three primary counts are always present; an optional sub-count is
        included only when it is not ``None``.
        """
        props: dict[str, str] = {
            "input_tokens": str(self.input_tokens),
            "output_tokens": str(self.output_tokens),
            "total_tokens": str(self.total_tokens),
        }
        for name in (
            "input_audio_tokens",
            "input_text_tokens",
            "input_cached_tokens",
            "output_audio_tokens",
            "output_text_tokens",
        ):
            value = getattr(self, name)
            if value is not None:
                props[name] = str(value)
        return props


# ---------------------------------------------------------------------------
# Low-level coercion helpers
#
# ---------------------------------------------------------------------------
def _to_int(value: Any, default: int = 0) -> int:
    """Best-effort int conversion; bool excluded; never raises.

    Args:
        value: Candidate count -- an int, float, numeric string, or any
            object accepted by ``int()``.
        default: Returned when ``value`` is ``None``, a ``bool``, or cannot
            be converted.

    Returns:
        The converted integer, or ``default``.
    """
    if value is None or isinstance(value, bool):
        return default
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        value = value.strip()
    # One guarded conversion covers every remaining case. ``str.isdigit()``
    # is not a safe pre-check (it accepts characters such as U+00B2 that
    # ``int()`` rejects), and NaN / infinity floats raise ValueError /
    # OverflowError respectively.
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Read an attribute or dict key uniformly.

    Mappings are read with ``.get``; every other object with ``getattr``.
    ``None`` as the container yields ``default``.
    """
    if obj is None:
        return default
    if isinstance(obj, Mapping):
        return obj.get(key, default)
    return getattr(obj, key, default)


def _is_iterable(obj: Any) -> bool:
    """True only for real iterables (lists/tuples/sets/generators), NOT for
    arbitrary objects that happen to expose ``__iter__``."""
    if obj is None:
        return False
    if isinstance(obj, (list, tuple, set, frozenset)):
        return True
    # Strings, bytes and mappings iterate, but never hold message/content lists.
    if isinstance(obj, (str, bytes, bytearray, Mapping)):
        return False
    # Mock objects are rejected explicitly rather than probed for ``__iter__``.
    if isinstance(obj, NonCallableMock):
        return False
    return hasattr(obj, "__iter__")


def _first_count(usage_obj: Any, keys: Iterable[str]) -> int:
    """Return the first truthy value found under ``keys`` as an int, else 0."""
    for key in keys:
        value = _get(usage_obj, key)
        if value:
            return _to_int(value)
    return 0


def _read_counts(usage_obj: Any) -> Optional[TokenUsage]:
    """Read ``input/output/total`` from any usage-bearing object/dict.

    Returns:
        A ``TokenUsage`` when at least one count is non-zero, else ``None``.
        A missing total is derived as ``input + output``.
    """
    if usage_obj is None:
        return None

    inp = _first_count(usage_obj, _INPUT_KEYS)
    out = _first_count(usage_obj, _OUTPUT_KEYS)
    tot = _first_count(usage_obj, _TOTAL_KEYS)

    if tot == 0 and (inp or out):
        tot = inp + out
    if not (inp or out or tot):
        return None
    return TokenUsage(input_tokens=inp, output_tokens=out, total_tokens=tot)


# ---------------------------------------------------------------------------
# Extraction -- public
# ---------------------------------------------------------------------------
+def extract_usage(result: Any) -> Optional[TokenUsage]: + """Extract usage from an agent_framework run result or ChatCompletion. + + Checks (in order): + 1. ``result.usage_details`` or ``result.usage`` + 2. ``result.raw_representation.usage`` (OpenAI ChatCompletion shape) + 3. Aggregated ``result.messages[*].contents[*].usage_details`` + + Never raises -- returns ``None`` on any unexpected shape. + """ + if result is None: + return None + + try: + for attr in ("usage_details", "usage"): + found = _read_counts(_get(result, attr)) + if found: + return found + + raw = _get(result, "raw_representation") + if raw is not None: + found = _read_counts(_get(raw, "usage")) + if found: + return found + + aggregated = TokenUsage() + found_any = False + messages = _get(result, "messages") + if not _is_iterable(messages): + return None + for msg in messages: + contents = _get(msg, "contents") + if not _is_iterable(contents): + continue + for content in contents: + usage = _get(content, "usage_details") or _get(content, "usage") + piece = _read_counts(usage) + if piece: + aggregated = aggregated + piece + found_any = True + return aggregated if found_any else None + except Exception as exc: + logger.debug("extract_usage failed: %s", exc, exc_info=True) + return None + + +def extract_usage_from_dict(data: Any) -> Optional[TokenUsage]: + """Extract from a raw dict / SDK usage object.""" + return _read_counts(data) + + +def extract_usage_from_stream_chunk(chunk: Any) -> Optional[TokenUsage]: + """Streaming chunks: try the top-level shape, then ``chunk.metadata.usage``.""" + found = extract_usage(chunk) + if found: + return found + metadata = _get(chunk, "metadata") + if metadata is not None: + return _read_counts(_get(metadata, "usage")) + return None + + +def extract_realtime_usage(response_obj: Any) -> Optional[TokenUsage]: + """Azure AI Voice Live ``response.done`` payload extractor. + + Includes audio / text / cached sub-counts when present. 
+ """ + usage = _get(response_obj, "usage") + if usage is None: + return None + + inp = _to_int(_get(usage, "input_tokens")) + out = _to_int(_get(usage, "output_tokens")) + tot = _to_int(_get(usage, "total_tokens")) + if tot == 0 and (inp or out): + tot = inp + out + + in_details = _get(usage, "input_token_details") or {} + out_details = _get(usage, "output_token_details") or {} + + record = TokenUsage( + input_tokens=inp, + output_tokens=out, + total_tokens=tot, + input_audio_tokens=_to_int(_get(in_details, "audio_tokens")), + input_text_tokens=_to_int(_get(in_details, "text_tokens")), + input_cached_tokens=_to_int(_get(in_details, "cached_tokens")), + output_audio_tokens=_to_int(_get(out_details, "audio_tokens")), + output_text_tokens=_to_int(_get(out_details, "text_tokens")), + ) + if record.has_any or any( + v for v in ( + record.input_audio_tokens, + record.input_text_tokens, + record.input_cached_tokens, + record.output_audio_tokens, + record.output_text_tokens, + ) + ): + return record + return None + + +# --------------------------------------------------------------------------- +# Tool / sub-agent attribution +# --------------------------------------------------------------------------- +def detect_invoked_tools(result: Any) -> set[str]: + """Return the set of tool/function names invoked in an agent result. + + Used by orchestrators that expose sub-agents via ``.as_tool()`` to attribute + token usage only to the sub-agents that were actually called. Never raises. 
+ """ + invoked: set[str] = set() + try: + messages = _get(result, "messages") + if not _is_iterable(messages): + return invoked + for msg in messages: + contents = _get(msg, "contents") + if not _is_iterable(contents): + continue + for content in contents: + if _get(content, "type") == "function_call": + name = _get(content, "name") + if name: + invoked.add(str(name)) + except Exception as exc: + logger.debug("detect_invoked_tools failed: %s", exc, exc_info=True) + return invoked + + +# --------------------------------------------------------------------------- +# Event sink (optional Application Insights dependency) +# --------------------------------------------------------------------------- +EventSink = Callable[[str, Mapping[str, str]], None] + + +def _default_event_sink() -> Optional[EventSink]: + """Return ``azure.monitor.events.extension.track_event`` if importable, + else ``None``.""" + try: + from azure.monitor.events.extension import track_event # type: ignore + except Exception: # pragma: no cover - optional dep + return None + return track_event + + +# --------------------------------------------------------------------------- +# Emitter +# --------------------------------------------------------------------------- +class TokenUsageEmitter: + """Emit standardized token-usage custom events to Application Insights. + + Responsibilities: + 1. Emit LLM_Agent_Token_Usage, LLM_Model_Token_Usage, and + LLM_Token_Usage_Summary events with consistent property schemas. + 2. Optionally sample high-cardinality events while always emitting + the summary event for accurate per-request totals. + 3. Support per-model pricing for estimated cost calculation. + 4. Hash user_id values for PII/GDPR compliance when configured. + + Attributes: + perf_slow_emit_threshold_ms: Soft threshold (ms) above which a + WARNING is logged for an individual emit call. 
+ """ + + def __init__( + self, + *, + connection_string: Optional[str] = None, + static_dimensions: Optional[Mapping[str, Any]] = None, + event_sink: Optional[EventSink] = None, + pricing: Optional[Mapping[str, tuple[float, float]]] = None, + user_id_hasher: Optional[Callable[[str], str]] = None, + sample_rate: float = 1.0, + logger: Optional[logging.Logger] = None, + ) -> None: + self._cs = connection_string if connection_string is not None else os.getenv( + "APPLICATIONINSIGHTS_CONNECTION_STRING" + ) + self._sink = event_sink if event_sink is not None else _default_event_sink() + self._log = logger or logging.getLogger(__name__) + + self._user_id_hasher = user_id_hasher + + try: + sr = float(sample_rate) + except (TypeError, ValueError): + sr = 1.0 + self._sample_rate = max(0.0, min(1.0, sr)) + + self._pricing: dict[str, tuple[float, float]] = {} + for model, rates in (pricing or {}).items(): + if not model or rates is None: + continue + try: + inp, out = rates + self._pricing[str(model).lower()] = (float(inp), float(out)) + except (TypeError, ValueError): + self._log.warning("Ignoring malformed pricing entry: %s=%r", model, rates) + + raw_static = dict(static_dimensions or {}) + if "user_id" in raw_static: + raw_static["user_id"] = self._apply_user_id_hash(raw_static["user_id"]) + self._static: dict[str, str] = { + k: ("" if v is None else str(v)) for k, v in raw_static.items() + } + + self._perf_total_ns: int = 0 + self._perf_emit_count: int = 0 + self._perf_max_ns: int = 0 + self.perf_slow_emit_threshold_ms: float = 50.0 + + @property + def enabled(self) -> bool: + """True when App Insights connection string and event sink are available.""" + return bool(self._cs) and self._sink is not None + + @property + def sample_rate(self) -> float: + """Current sampling rate for high-cardinality events.""" + return self._sample_rate + + def _apply_user_id_hash(self, value: Any) -> Any: + """Apply the configured user_id_hasher; never raises.""" + if value is None or 
value == "" or self._user_id_hasher is None: + return value + try: + return self._user_id_hasher(str(value)) + except Exception as exc: + self._log.warning("user_id_hasher raised: %s", exc) + return value + + def _should_sample(self) -> bool: + """Sampling decision for high-cardinality events.""" + if self._sample_rate >= 1.0: + return True + if self._sample_rate <= 0.0: + return False + return random.random() < self._sample_rate + + def _cost_props( + self, model_deployment_name: Optional[str], usage: TokenUsage + ) -> dict[str, str]: + """Return ``{'estimated_cost_usd': '...'}`` when pricing is configured.""" + if not self._pricing or not model_deployment_name: + return {} + rate = self._pricing.get(model_deployment_name.lower()) + if not rate: + return {} + inp_rate, out_rate = rate + cost = (usage.input_tokens * inp_rate + usage.output_tokens * out_rate) / 1000.0 + return {"estimated_cost_usd": f"{cost:.6f}"} + + def _summary_cost_props( + self, + primary_model: Optional[str], + additional_agents: Mapping[str, str], + usage: TokenUsage, + ) -> dict[str, str]: + """Best-effort cost for the summary event.""" + if primary_model: + cost = self._cost_props(primary_model, usage) + if cost: + return cost + for m in additional_agents.values(): + cost = self._cost_props(m, usage) + if cost: + return cost + return {} + + def emit(self, event_name: str, **dimensions: Any) -> None: + """Low-level: emit an event with arbitrary properties. 
Never raises.""" + start_ns = time.perf_counter_ns() + try: + props = dict(self._static) + for k, v in dimensions.items(): + if v is None: + continue + if k == "user_id": + v = self._apply_user_id_hash(v) + if v is None or v == "": + continue + props[k] = v if isinstance(v, str) else str(v) + + if not self.enabled: + self._log.debug( + "App Insights not configured -- skipping event %s (%s)", + event_name, props, + ) + return + try: + self._sink(event_name, props) # type: ignore[misc] + except Exception as exc: + self._log.warning("track_event(%s) failed: %s", event_name, exc) + finally: + elapsed_ns = time.perf_counter_ns() - start_ns + self._perf_total_ns += elapsed_ns + self._perf_emit_count += 1 + if elapsed_ns > self._perf_max_ns: + self._perf_max_ns = elapsed_ns + elapsed_ms = elapsed_ns / 1_000_000.0 + if elapsed_ms > self.perf_slow_emit_threshold_ms: + self._log.warning( + "Token telemetry emit slow: event=%s duration_ms=%.3f", + event_name, elapsed_ms, + ) + else: + self._log.debug( + "Token telemetry emit: event=%s duration_ms=%.3f", + event_name, elapsed_ms, + ) + + def perf_stats(self) -> dict[str, float]: + """Return cumulative telemetry-overhead stats. + + Returns: + Dict with keys: emit_count, total_ms, avg_ms, max_ms. 
+ """ + count = self._perf_emit_count + total_ms = self._perf_total_ns / 1_000_000.0 + return { + "emit_count": float(count), + "total_ms": total_ms, + "avg_ms": (total_ms / count) if count else 0.0, + "max_ms": self._perf_max_ns / 1_000_000.0, + } + + def reset_perf_stats(self) -> None: + """Zero the perf counters.""" + self._perf_total_ns = 0 + self._perf_emit_count = 0 + self._perf_max_ns = 0 + + def emit_agent( + self, + *, + agent_name: str, + model_deployment_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-agent token usage event.""" + if not usage.has_any or not self._should_sample(): + return + self.emit( + EVENT_AGENT, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_model( + self, + *, + model_deployment_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-model token usage event.""" + if not usage.has_any or not self._should_sample(): + return + self.emit( + EVENT_MODEL, + model_deployment_name=model_deployment_name, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_user( + self, + *, + user_id: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-user token usage event.""" + if not usage.has_any or not user_id or not self._should_sample(): + return + self.emit( + EVENT_USER, + user_id=user_id, + **usage.to_event_props(), + **dimensions, + ) + + def emit_team( + self, + *, + team_name: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Emit a per-team token usage event.""" + if not usage.has_any or not team_name or not self._should_sample(): + return + self.emit( + EVENT_TEAM, + team_name=team_name, + **usage.to_event_props(), + **dimensions, + ) + + def emit_summary( + self, + *, + usage: TokenUsage, + agent_count: int = 1, + model_count: int = 1, + 
primary_model: Optional[str] = None, + additional_agents: Optional[Mapping[str, str]] = None, + **dimensions: Any, + ) -> None: + """Emit the summary event (always fires, ignores sample_rate).""" + if not usage.has_any: + return + props = { + "total_input_tokens": str(usage.input_tokens), + "total_output_tokens": str(usage.output_tokens), + "total_tokens": str(usage.total_tokens), + "agent_count": str(agent_count), + "model_count": str(model_count), + "sample_rate": f"{self._sample_rate:.4f}", + } + for k, v in usage.to_event_props().items(): + props.setdefault(k, v) + props.update(self._summary_cost_props(primary_model, additional_agents or {}, usage)) + self.emit(EVENT_SUMMARY, **props, **dimensions) + + def emit_speech( + self, + *, + model_deployment_name: str, + source: str, + usage: TokenUsage, + **dimensions: Any, + ) -> None: + """Voice-Live / realtime speech usage event.""" + if not self._should_sample(): + return + self.emit( + EVENT_SPEECH, + model_deployment_name=model_deployment_name, + source=source, + **usage.to_event_props(), + **self._cost_props(model_deployment_name, usage), + **dimensions, + ) + + def emit_all( + self, + *, + agent_name: str, + model_deployment_name: str, + usage: TokenUsage, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, + **dimensions: Any, + ) -> None: + """Emit summary, agent, and one model event per distinct model deployment. + + Args: + agent_name: Name of the primary agent/step. + model_deployment_name: Model deployment used by the primary agent. + usage: Accumulated token usage for this invocation. + additional_agents: Maps sub-agent name -> model deployment name. + emit_user_event: Opt in to per-user events. + emit_team_event: Opt in to per-team events. + **dimensions: Extra properties forwarded to all events. 
+ """ + if not usage.has_any: + return + + agents = {agent_name: model_deployment_name} + if additional_agents: + agents.update({k: v for k, v in additional_agents.items() if k}) + models = {m for m in agents.values() if m} + + batch_start_ns = time.perf_counter_ns() + + self.emit_agent( + agent_name=agent_name, + model_deployment_name=model_deployment_name, + usage=usage, + **dimensions, + ) + for model in models: + self.emit_model( + model_deployment_name=model, + usage=usage, + **dimensions, + ) + if emit_user_event and dimensions.get("user_id"): + self.emit_user( + user_id=str(dimensions["user_id"]), + usage=usage, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + ) + if emit_team_event and dimensions.get("team_name"): + self.emit_team( + team_name=str(dimensions["team_name"]), + usage=usage, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + ) + + batch_overhead_ms = (time.perf_counter_ns() - batch_start_ns) / 1_000_000.0 + self.emit_summary( + usage=usage, + agent_count=len(agents), + model_count=len(models) or 1, + primary_model=model_deployment_name, + additional_agents=additional_agents, + telemetry_overhead_ms=f"{batch_overhead_ms:.3f}", + **dimensions, + ) + + self._log.info( + "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d %s", + agent_name, + model_deployment_name, + usage.input_tokens, + usage.output_tokens, + usage.total_tokens, + " ".join(f"{k}={v}" for k, v in dimensions.items() if v), + ) + + +# --------------------------------------------------------------------------- +# Scope / decorator sugar +# --------------------------------------------------------------------------- +@dataclass +class TokenUsageScope(AbstractContextManager): + """Accumulate usage across multiple results, then emit on exit. 
+ + Example:: + + with TokenUsageScope(emitter, + agent_name="MapHandler", + model_deployment_name=cfg.model, + process_id=pid) as scope: + result = await agent.run(prompt) + scope.add(result) + + Attributes: + emitter: The TokenUsageEmitter instance to use for emission. + agent_name: Name of the agent/step being tracked. + model_deployment_name: Model deployment name for attribution. + dimensions: Extra properties forwarded to all events. + additional_agents: Maps sub-agent name -> model deployment name. + emit_user_event: Whether to emit per-user events. + emit_team_event: Whether to emit per-team events. + usage: Accumulated TokenUsage so far. + """ + + emitter: TokenUsageEmitter + agent_name: str + model_deployment_name: str + dimensions: dict[str, Any] = field(default_factory=dict) + additional_agents: dict[str, str] = field(default_factory=dict) + emit_user_event: bool = False + emit_team_event: bool = False + usage: TokenUsage = field(default_factory=TokenUsage) + + def __init__( + self, + emitter: TokenUsageEmitter, + *, + agent_name: str, + model_deployment_name: str, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, + **dimensions: Any, + ) -> None: + self.emitter = emitter + self.agent_name = agent_name + self.model_deployment_name = model_deployment_name + self.additional_agents = dict(additional_agents or {}) + self.emit_user_event = emit_user_event + self.emit_team_event = emit_team_event + self.dimensions = dict(dimensions) + self.usage = TokenUsage() + self._extract_ns: int = 0 + self._emit_ns: int = 0 + + def add(self, source: Any) -> Optional[TokenUsage]: + """Extract usage from any supported shape and add to the running total. + + Args: + source: Agent run result, ChatMessage, or ChatCompletion object. + + Returns: + The extracted TokenUsage, or None if extraction failed. 
+ """ + start_ns = time.perf_counter_ns() + try: + found = extract_usage(source) or extract_usage_from_stream_chunk(source) + except Exception as exc: + logger.debug("TokenUsageScope.add failed: %s", exc, exc_info=True) + return None + finally: + self._extract_ns += time.perf_counter_ns() - start_ns + if found: + self.usage = self.usage + found + return found + + def add_usage(self, usage: TokenUsage) -> None: + """Add a pre-constructed TokenUsage to the running total.""" + self.usage = self.usage + usage + + def add_chunks(self, chunks: Iterable[Any]) -> None: + """Extract and accumulate usage from a stream of chunks.""" + for c in chunks: + self.add(c) + + @property + def extract_ms(self) -> float: + """Total ms spent inside :meth:`add` / :meth:`add_chunks`.""" + return self._extract_ns / 1_000_000.0 + + @property + def emit_ms(self) -> float: + """Total ms spent in the on-exit emit batch.""" + return self._emit_ns / 1_000_000.0 + + @property + def total_overhead_ms(self) -> float: + """Total telemetry overhead added by this scope (extract + emit).""" + return self.extract_ms + self.emit_ms + + def __exit__(self, exc_type, exc, tb) -> None: + emit_start_ns = time.perf_counter_ns() + try: + self.emitter.emit_all( + agent_name=self.agent_name, + model_deployment_name=self.model_deployment_name, + usage=self.usage, + additional_agents=self.additional_agents, + emit_user_event=self.emit_user_event, + emit_team_event=self.emit_team_event, + **self.dimensions, + ) + except Exception as emit_exc: # pragma: no cover + logger.warning("TokenUsageScope emit failed: %s", emit_exc) + finally: + self._emit_ns += time.perf_counter_ns() - emit_start_ns + logger.debug( + "TokenUsageScope overhead: agent=%s extract_ms=%.3f " + "emit_ms=%.3f total_ms=%.3f", + self.agent_name, + self.extract_ms, + self.emit_ms, + self.total_overhead_ms, + ) + return None + + +def track_tokens( + emitter: TokenUsageEmitter, + *, + agent_name: str, + model_deployment_name: str, + dimension_args: 
Optional[Mapping[str, str]] = None, + additional_agents: Optional[Mapping[str, str]] = None, + emit_user_event: bool = False, + emit_team_event: bool = False, +): + """Decorator: wrap an async or sync function that returns an LLM result. + + Args: + emitter: TokenUsageEmitter to use. + agent_name: Name of the agent/step. + model_deployment_name: Model deployment name. + dimension_args: Maps emitted-property-name -> callable-keyword-argument. + additional_agents: Sub-agent name -> model deployment name mapping. + emit_user_event: Opt in to per-user events. + emit_team_event: Opt in to per-team events. + """ + + dim_args = dict(dimension_args or {}) + + def _decorator(fn: Callable[..., Any]): + is_coro = _is_coroutine_function(fn) + + if is_coro: + @functools.wraps(fn) + async def _aw(*args, **kwargs) -> Any: + with _scope_for(kwargs) as scope: + result = await fn(*args, **kwargs) + scope.add(result) + return result + return _aw + + @functools.wraps(fn) + def _sw(*args, **kwargs) -> Any: + with _scope_for(kwargs) as scope: + result = fn(*args, **kwargs) + scope.add(result) + return result + return _sw + + def _scope_for(call_kwargs: Mapping[str, Any]) -> TokenUsageScope: + dimensions = { + prop: call_kwargs.get(kw) + for prop, kw in dim_args.items() + if call_kwargs.get(kw) is not None + } + return TokenUsageScope( + emitter, + agent_name=agent_name, + model_deployment_name=model_deployment_name, + additional_agents=additional_agents, + emit_user_event=emit_user_event, + emit_team_event=emit_team_event, + **dimensions, + ) + + return _decorator + + +def _is_coroutine_function(fn: Callable[..., Any]) -> bool: + return asyncio.iscoroutinefunction(fn) + + +__all__ = [ + "EVENT_SUMMARY", + "EVENT_AGENT", + "EVENT_MODEL", + "EVENT_USER", + "EVENT_TEAM", + "EVENT_SPEECH", + "TokenUsage", + "TokenUsageEmitter", + "TokenUsageScope", + "track_tokens", + "extract_usage", + "extract_usage_from_dict", + "extract_usage_from_stream_chunk", + "extract_realtime_usage", + 
"detect_invoked_tools", +] diff --git a/src/ContentProcessorWorkflow/src/libs/telemetry.py b/src/ContentProcessorWorkflow/src/libs/telemetry.py new file mode 100644 index 00000000..79e1f4bf --- /dev/null +++ b/src/ContentProcessorWorkflow/src/libs/telemetry.py @@ -0,0 +1,92 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Process-wide telemetry singletons for the content-processing workflow. + +A single :class:`TokenUsageEmitter` is constructed at import time so every +executor shares the same App Insights connection-string resolution and +static dimensions. Importing this module has no side effects beyond reading +``APPLICATIONINSIGHTS_CONNECTION_STRING`` and the env vars documented below. + +Optional environment variables +------------------------------ +LLM_TOKEN_SAMPLE_RATE + Float in [0, 1]. Fraction of high-cardinality token events + (agent/model/user/team/speech) to ship. The summary event always fires. + Defaults to ``1.0``. + +LLM_TOKEN_USER_ID_HMAC_KEY + When set, ``user_id`` values are replaced with an HMAC-SHA256 hex digest + (truncated to 16 chars) before leaving the process. Use to satisfy + GDPR / PII handling requirements without modifying call sites. + +LLM_TOKEN_PRICING + Optional comma-separated list of ``model=in_per_1k:out_per_1k`` entries, + e.g. ``gpt-4o=0.0025:0.01,gpt-4o-mini=0.00015:0.0006``. When set the + emitter attaches ``estimated_cost_usd`` to agent / model / summary + events so dashboards can group by cost without hard-coded KQL rates. 
+""" +from __future__ import annotations + +import hashlib +import hmac +import logging +import os +from typing import Callable, Optional + +from libs.llm_token_telemetry import TokenUsageEmitter + +_log = logging.getLogger(__name__) + + +def _parse_sample_rate() -> float: + raw = os.getenv("LLM_TOKEN_SAMPLE_RATE") + if not raw: + return 1.0 + try: + return max(0.0, min(1.0, float(raw))) + except ValueError: + _log.warning("Invalid LLM_TOKEN_SAMPLE_RATE=%r; defaulting to 1.0", raw) + return 1.0 + + +def _build_user_id_hasher() -> Optional[Callable[[str], str]]: + key = os.getenv("LLM_TOKEN_USER_ID_HMAC_KEY") + if not key: + return None + key_bytes = key.encode("utf-8") + + def _hash(value: str) -> str: + digest = hmac.new(key_bytes, value.encode("utf-8"), hashlib.sha256).hexdigest() + return digest[:16] + + return _hash + + +def _parse_pricing() -> dict[str, tuple[float, float]]: + raw = os.getenv("LLM_TOKEN_PRICING") + if not raw: + return {} + pricing: dict[str, tuple[float, float]] = {} + for entry in raw.split(","): + entry = entry.strip() + if not entry or "=" not in entry: + continue + model, rates = entry.split("=", 1) + if ":" not in rates: + continue + in_s, out_s = rates.split(":", 1) + try: + pricing[model.strip().lower()] = (float(in_s), float(out_s)) + except ValueError: + _log.warning("Ignoring malformed pricing entry: %s", entry) + return pricing + + +token_emitter = TokenUsageEmitter( + static_dimensions={"app": "content-processing"}, + sample_rate=_parse_sample_rate(), + user_id_hasher=_build_user_id_hasher(), + pricing=_parse_pricing(), +) + +__all__ = ["token_emitter"] diff --git a/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py b/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py deleted file mode 100644 index b88c5cd5..00000000 --- a/src/ContentProcessorWorkflow/src/libs/token_usage_utils.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
- -"""Token usage tracking for LLM calls in the content processing pipeline. - -Extracts token counts from Azure OpenAI agent framework responses and emits -custom events to Application Insights for monitoring, cost estimation, and -performance optimization. -""" - -import logging -import os -from typing import Any - -logger = logging.getLogger(__name__) - - -def _track_event_if_configured(event_name: str, event_data: dict) -> None: - """Track a custom event to Application Insights if configured. - - Args: - event_name: Name of the custom event. - event_data: Dictionary of event properties (all values must be strings). - """ - connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") - if connection_string: - try: - from azure.monitor.events.extension import track_event - - track_event(event_name, event_data) - except Exception as exc: - logger.warning("Failed to track event '%s': %s", event_name, exc) - else: - logger.debug( - "Skipping track_event for %s: Application Insights is not configured", - event_name, - ) - - -def extract_token_usage(response: Any) -> dict[str, int]: - """Extract token usage from an agent framework ChatMessage response. - - Checks multiple attribute paths to handle different response shapes - from the agent framework SDK. - - Args: - response: The ChatMessage response object from agent.run(). - - Returns: - Dict with keys: input_tokens, output_tokens, total_tokens. - All default to 0 if not found. 
- """ - input_tokens = 0 - output_tokens = 0 - total_tokens = 0 - - # Path 1: usage_details attribute (set by agent framework SDK) - usage_details = getattr(response, "usage_details", None) - if usage_details is not None: - if isinstance(usage_details, dict): - input_tokens = _to_int( - usage_details.get("input_token_count") - or usage_details.get("prompt_tokens") - or usage_details.get("input_tokens") - ) - output_tokens = _to_int( - usage_details.get("output_token_count") - or usage_details.get("completion_tokens") - or usage_details.get("output_tokens") - ) - total_tokens = _to_int( - usage_details.get("total_token_count") - or usage_details.get("total_tokens") - ) or (input_tokens + output_tokens) - else: - # UsageDetails object with attributes - input_tokens = _to_int( - getattr(usage_details, "input_token_count", 0) - or getattr(usage_details, "prompt_tokens", 0) - ) - output_tokens = _to_int( - getattr(usage_details, "output_token_count", 0) - or getattr(usage_details, "completion_tokens", 0) - ) - total_tokens = _to_int( - getattr(usage_details, "total_token_count", 0) - ) or (input_tokens + output_tokens) - - # Path 2: raw_representation.usage (raw Azure OpenAI response) - if total_tokens == 0: - raw = getattr(response, "raw_representation", None) - if raw is not None: - usage_obj = getattr(raw, "usage", None) - if usage_obj is not None: - if isinstance(usage_obj, dict): - input_tokens = _to_int( - usage_obj.get("prompt_tokens") - or usage_obj.get("input_tokens") - ) - output_tokens = _to_int( - usage_obj.get("completion_tokens") - or usage_obj.get("output_tokens") - ) - total_tokens = _to_int( - usage_obj.get("total_tokens") - ) or (input_tokens + output_tokens) - else: - input_tokens = _to_int( - getattr(usage_obj, "prompt_tokens", 0) - or getattr(usage_obj, "input_tokens", 0) - ) - output_tokens = _to_int( - getattr(usage_obj, "completion_tokens", 0) - or getattr(usage_obj, "output_tokens", 0) - ) - total_tokens = _to_int( - getattr(usage_obj, 
"total_tokens", 0) - ) or (input_tokens + output_tokens) - - return { - "input_tokens": input_tokens, - "output_tokens": output_tokens, - "total_tokens": total_tokens, - } - - -def emit_agent_token_event( - agent_name: str, - model_deployment_name: str, - usage: dict[str, int], - process_id: str = "", -) -> None: - """Emit a per-agent token usage event to Application Insights. - - Args: - agent_name: Name of the pipeline step/agent (e.g. 'MapHandler', 'RAI'). - model_deployment_name: Azure OpenAI model deployment name. - usage: Dict with input_tokens, output_tokens, total_tokens. - process_id: Document processing ID for correlation. - """ - _track_event_if_configured("LLM_Agent_Token_Usage", { - "agent_name": agent_name, - "input_tokens": str(usage.get("input_tokens", 0)), - "output_tokens": str(usage.get("output_tokens", 0)), - "total_tokens": str(usage.get("total_tokens", 0)), - "model_deployment_name": model_deployment_name, - "process_id": process_id, - }) - logger.info( - "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d process=%s", - agent_name, - model_deployment_name, - usage.get("input_tokens", 0), - usage.get("output_tokens", 0), - usage.get("total_tokens", 0), - process_id, - ) - - -def emit_model_token_event( - model_deployment_name: str, - usage: dict[str, int], - process_id: str = "", -) -> None: - """Emit a per-model token usage event to Application Insights. - - Args: - model_deployment_name: Azure OpenAI model deployment name. - usage: Dict with input_tokens, output_tokens, total_tokens. - process_id: Document processing ID for correlation. 
- """ - _track_event_if_configured("LLM_Model_Token_Usage", { - "model_deployment_name": model_deployment_name, - "input_tokens": str(usage.get("input_tokens", 0)), - "output_tokens": str(usage.get("output_tokens", 0)), - "total_tokens": str(usage.get("total_tokens", 0)), - "process_id": process_id, - }) - - -def emit_summary_token_event( - total_input_tokens: int, - total_output_tokens: int, - total_tokens: int, - process_id: str = "", - file_name: str = "", - file_mime_type: str = "", - agent_count: int = 0, - model_count: int = 0, -) -> None: - """Emit a summary token usage event for a complete document processing run. - - Args: - total_input_tokens: Sum of all input tokens across all steps. - total_output_tokens: Sum of all output tokens across all steps. - total_tokens: Sum of all tokens across all steps. - process_id: Document processing ID. - file_name: Name of the processed file. - file_mime_type: MIME type of the processed file. - agent_count: Number of agents/steps that used tokens. - model_count: Number of distinct models used. - """ - _track_event_if_configured("LLM_Token_Usage_Summary", { - "total_input_tokens": str(total_input_tokens), - "total_output_tokens": str(total_output_tokens), - "total_tokens": str(total_tokens), - "process_id": process_id, - "file_name": file_name, - "file_mime_type": file_mime_type, - "agent_count": str(agent_count), - "model_count": str(model_count), - }) - logger.info( - "[TOKEN SUMMARY] process=%s file=%s input=%d output=%d total=%d agents=%d models=%d", - process_id, - file_name, - total_input_tokens, - total_output_tokens, - total_tokens, - agent_count, - model_count, - ) - - -def _to_int(val: object, default: int = 0) -> int: - """Safely convert a value to int. - - Args: - val: Value to convert. - default: Default if conversion fails. - - Returns: - Integer value or default. 
- """ - if val is None or isinstance(val, bool): - return default - if isinstance(val, int): - return val - if isinstance(val, float): - return int(val) - if isinstance(val, str): - s = val.strip() - if s.isdigit(): - return int(s) - return default diff --git a/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py b/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py index 9d21d555..272566c7 100644 --- a/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py @@ -31,7 +31,8 @@ from steps.models.extracted_file import ExtractedFile from steps.models.output import Executor_Output, Workflow_Output -from libs.token_usage_utils import emit_agent_token_event, extract_token_usage +from libs.llm_token_telemetry import TokenUsageScope +from libs.telemetry import token_emitter class GapExecutor(Executor): @@ -195,14 +196,14 @@ async def handle_execute( ) # Track token usage for gap analysis - token_usage = extract_token_usage(model_response) model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name - emit_agent_token_event( + with TokenUsageScope( + token_emitter, agent_name="GapAnalysis", model_deployment_name=model_name, - usage=token_usage, process_id=result.claim_process_id, - ) + ) as scope: + scope.add(model_response) claim_process_repository = self.app_context.get_service(Claim_Processes) await claim_process_repository.Update_Claim_Process_Gaps( diff --git a/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py b/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py index 9a33735f..3743f4a9 100644 --- a/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py @@ -27,7 +27,8 @@ from services.content_process_service import ContentProcessService from steps.rai.model import 
rai_response -from libs.token_usage_utils import emit_agent_token_event, extract_token_usage +from libs.llm_token_telemetry import TokenUsageScope +from libs.telemetry import token_emitter class RAIExecutor(Executor): @@ -189,14 +190,14 @@ async def handle_exectue( ) # Track token usage for RAI check - token_usage = extract_token_usage(model_response) model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name - emit_agent_token_event( + with TokenUsageScope( + token_emitter, agent_name="RAI", model_deployment_name=model_name, - usage=token_usage, process_id=result.claim_process_id, - ) + ) as scope: + scope.add(model_response) response_content = model_response.text parsed_response = rai_response.RAIResponse.model_validate_json(response_content) diff --git a/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py b/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py index f2e5c8ca..c8400ffa 100644 --- a/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py @@ -28,7 +28,8 @@ from steps.models.extracted_file import ExtractedFile from steps.models.output import Executor_Output, Workflow_Output -from libs.token_usage_utils import emit_agent_token_event, extract_token_usage +from libs.llm_token_telemetry import TokenUsageScope +from libs.telemetry import token_emitter class SummarizeExecutor(Executor): @@ -195,14 +196,14 @@ async def handle_execute( ) # Track token usage for summarization - token_usage = extract_token_usage(model_response) model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name - emit_agent_token_event( + with TokenUsageScope( + token_emitter, agent_name="Summarize", model_deployment_name=model_name, - usage=token_usage, process_id=result.claim_process_id, - ) + ) as scope: + scope.add(model_response) 
summarized_result = {"status": "summarized", "input": model_response.text} From 5d85f659d7cb1d02b659dd8990b34c6886ef4d6f Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Mon, 1 Jun 2026 16:20:04 +0530 Subject: [PATCH 19/23] fix: remove unused import and trailing blank line in test_token_usage_utils Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py index 5e9c1b83..f68cc76e 100644 --- a/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py +++ b/src/ContentProcessor/tests/unit/libs/test_token_usage_utils.py @@ -5,7 +5,7 @@ from __future__ import annotations -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock from libs.llm_token_telemetry import ( TokenUsage, @@ -369,4 +369,3 @@ def test_scope_accumulates_multiple_adds(self): assert scope.usage.input_tokens == 300 assert scope.usage.output_tokens == 130 assert scope.usage.total_tokens == 430 - From a8fc83f6b27bcd4b8e601b2b263016a3f5c0f4c1 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Mon, 1 Jun 2026 18:20:22 +0530 Subject: [PATCH 20/23] fix: add file_name and file_mime_type to token usage telemetry events - Pass file metadata in TokenUsageScope for MapHandler, Summarize, GapAnalysis, and RAI executors so the workbook file-type query works - Fix dedup in model cost KQL query (workbook + queries file) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/libs/pipeline/handlers/map_handler.py | 3 +++ .../src/steps/gap_analysis/executor/gap_executor.py | 4 ++++ .../src/steps/rai/executor/rai_executor.py | 4 ++++ .../src/steps/summarize/executor/summarize_executor.py | 4 ++++ 4 files changed, 15 insertions(+) diff --git 
a/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py b/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py index 3b31e7c8..2bcfb5ad 100644 --- a/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py +++ b/src/ContentProcessor/src/libs/pipeline/handlers/map_handler.py @@ -266,11 +266,14 @@ async def execute(self, context: MessageContext) -> StepResult: ) # Track token usage for this LLM call + source_file = context.data_pipeline.get_source_files()[0] with TokenUsageScope( token_emitter, agent_name="MapHandler", model_deployment_name=self.application_context.configuration.app_azure_openai_model, process_id=context.data_pipeline.pipeline_status.process_id, + file_name=source_file.name, + file_mime_type=source_file.mime_type or "", ) as scope: scope.add(gpt_response) diff --git a/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py b/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py index 272566c7..24c5785c 100644 --- a/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/gap_analysis/executor/gap_executor.py @@ -197,11 +197,15 @@ async def handle_execute( # Track token usage for gap analysis model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name + file_names = ", ".join(f.file_name for f in processed_files) if processed_files else "" + file_types = ", ".join(set(f.mime_type for f in processed_files if f.mime_type)) if processed_files else "" with TokenUsageScope( token_emitter, agent_name="GapAnalysis", model_deployment_name=model_name, process_id=result.claim_process_id, + file_name=file_names, + file_mime_type=file_types, ) as scope: scope.add(model_response) diff --git a/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py b/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py index 3743f4a9..de56fd9f 100644 --- 
a/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/rai/executor/rai_executor.py @@ -191,11 +191,15 @@ async def handle_exectue( # Track token usage for RAI check model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name + file_names = ", ".join(f.file_name for f in processed_files) if processed_files else "" + file_types = ", ".join(set(f.mime_type for f in processed_files if f.mime_type)) if processed_files else "" with TokenUsageScope( token_emitter, agent_name="RAI", model_deployment_name=model_name, process_id=result.claim_process_id, + file_name=file_names, + file_mime_type=file_types, ) as scope: scope.add(model_response) diff --git a/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py b/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py index c8400ffa..23bd30ce 100644 --- a/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py +++ b/src/ContentProcessorWorkflow/src/steps/summarize/executor/summarize_executor.py @@ -197,11 +197,15 @@ async def handle_execute( # Track token usage for summarization model_name = agent_framework_helper.settings.get_service_config("default").chat_deployment_name + file_names = ", ".join(f.file_name for f in processed_files) if processed_files else "" + file_types = ", ".join(set(f.mime_type for f in processed_files if f.mime_type)) if processed_files else "" with TokenUsageScope( token_emitter, agent_name="Summarize", model_deployment_name=model_name, process_id=result.claim_process_id, + file_name=file_names, + file_mime_type=file_types, ) as scope: scope.add(model_response) From 0d55e8c6cedc0efd1605fab5dd834b9dfa9e2828 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Tue, 2 Jun 2026 15:41:26 +0530 Subject: [PATCH 21/23] feat: add deterministic event_id for deduplication in telemetry events Add SHA-256 based event_id 
(event_name|process_id|agent_name|model) to each emitted event, enabling KQL dedup without accelerator-specific max() patterns. This makes the workbook generic across accelerators. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/libs/llm_token_telemetry.py | 13 +++++++++++++ .../src/libs/llm_token_telemetry.py | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/ContentProcessor/src/libs/llm_token_telemetry.py b/src/ContentProcessor/src/libs/llm_token_telemetry.py index 7e838b5a..0e541284 100644 --- a/src/ContentProcessor/src/libs/llm_token_telemetry.py +++ b/src/ContentProcessor/src/libs/llm_token_telemetry.py @@ -48,6 +48,7 @@ import asyncio import functools +import hashlib import logging import os import random @@ -526,6 +527,18 @@ def emit(self, event_name: str, **dimensions: Any) -> None: continue props[k] = v if isinstance(v, str) else str(v) + # Deterministic event_id for deduplication across services. + # Key fields: event_name + process_id + agent_name + model + dedup_parts = [ + event_name, + props.get("process_id", ""), + props.get("agent_name", ""), + props.get("model_deployment_name", ""), + ] + props["event_id"] = hashlib.sha256( + "|".join(dedup_parts).encode() + ).hexdigest()[:16] + if not self.enabled: self._log.debug( "App Insights not configured -- skipping event %s (%s)", diff --git a/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py b/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py index 7e838b5a..0e541284 100644 --- a/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py +++ b/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py @@ -48,6 +48,7 @@ import asyncio import functools +import hashlib import logging import os import random @@ -526,6 +527,18 @@ def emit(self, event_name: str, **dimensions: Any) -> None: continue props[k] = v if isinstance(v, str) else str(v) + # Deterministic event_id for deduplication across services. 
+ # Key fields: event_name + process_id + agent_name + model + dedup_parts = [ + event_name, + props.get("process_id", ""), + props.get("agent_name", ""), + props.get("model_deployment_name", ""), + ] + props["event_id"] = hashlib.sha256( + "|".join(dedup_parts).encode() + ).hexdigest()[:16] + if not self.enabled: self._log.debug( "App Insights not configured -- skipping event %s (%s)", From f4ee50e778b192410c1808fa1b0b95db1752ac0e Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Tue, 2 Jun 2026 15:48:01 +0530 Subject: [PATCH 22/23] Address PR #586 review comments - Change emit_all() logging from INFO to DEBUG to reduce production log volume - Optional token-detail fields default to None when absent instead of 0 - Remove redundant extract_usage_from_stream_chunk() fallback in TokenUsageScope.add() - Pin azure-monitor-events-extension to ==0.1.0 in both projects - Revert enableMonitoring to false in default parameters Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- infra/main.parameters.json | 2 +- src/ContentProcessor/pyproject.toml | 2 +- .../src/libs/llm_token_telemetry.py | 14 +++++++------- src/ContentProcessorWorkflow/pyproject.toml | 2 +- .../src/libs/llm_token_telemetry.py | 14 +++++++------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/infra/main.parameters.json b/infra/main.parameters.json index de82c938..3cd4bcaa 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -36,7 +36,7 @@ "value": "${AZURE_ENV_IMAGETAG=latest_v2}" }, "enableMonitoring": { - "value": true + "value": false } } } \ No newline at end of file diff --git a/src/ContentProcessor/pyproject.toml b/src/ContentProcessor/pyproject.toml index b4b8172a..e513a380 100644 --- a/src/ContentProcessor/pyproject.toml +++ b/src/ContentProcessor/pyproject.toml @@ -9,7 +9,7 @@ dependencies = [ "azure-ai-inference==1.0.0b9", "azure-appconfiguration==1.8.0", "azure-identity==1.26.0b1", - "azure-monitor-events-extension>=0.1.0", + 
"azure-monitor-events-extension==0.1.0", "azure-monitor-opentelemetry==1.8.7", "azure-storage-blob==12.29.0b1", "azure-storage-queue==12.16.0b1", diff --git a/src/ContentProcessor/src/libs/llm_token_telemetry.py b/src/ContentProcessor/src/libs/llm_token_telemetry.py index 0e541284..ae8aa383 100644 --- a/src/ContentProcessor/src/libs/llm_token_telemetry.py +++ b/src/ContentProcessor/src/libs/llm_token_telemetry.py @@ -326,11 +326,11 @@ def extract_realtime_usage(response_obj: Any) -> Optional[TokenUsage]: input_tokens=inp, output_tokens=out, total_tokens=tot, - input_audio_tokens=_to_int(_get(in_details, "audio_tokens")), - input_text_tokens=_to_int(_get(in_details, "text_tokens")), - input_cached_tokens=_to_int(_get(in_details, "cached_tokens")), - output_audio_tokens=_to_int(_get(out_details, "audio_tokens")), - output_text_tokens=_to_int(_get(out_details, "text_tokens")), + input_audio_tokens=_to_int(_get(in_details, "audio_tokens")) if _get(in_details, "audio_tokens") is not None else None, + input_text_tokens=_to_int(_get(in_details, "text_tokens")) if _get(in_details, "text_tokens") is not None else None, + input_cached_tokens=_to_int(_get(in_details, "cached_tokens")) if _get(in_details, "cached_tokens") is not None else None, + output_audio_tokens=_to_int(_get(out_details, "audio_tokens")) if _get(out_details, "audio_tokens") is not None else None, + output_text_tokens=_to_int(_get(out_details, "text_tokens")) if _get(out_details, "text_tokens") is not None else None, ) if record.has_any or any( v for v in ( @@ -776,7 +776,7 @@ def emit_all( **dimensions, ) - self._log.info( + self._log.debug( "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d %s", agent_name, model_deployment_name, @@ -856,7 +856,7 @@ def add(self, source: Any) -> Optional[TokenUsage]: """ start_ns = time.perf_counter_ns() try: - found = extract_usage(source) or extract_usage_from_stream_chunk(source) + found = extract_usage(source) except Exception as exc: 
logger.debug("TokenUsageScope.add failed: %s", exc, exc_info=True) return None diff --git a/src/ContentProcessorWorkflow/pyproject.toml b/src/ContentProcessorWorkflow/pyproject.toml index 2046388b..ef414f38 100644 --- a/src/ContentProcessorWorkflow/pyproject.toml +++ b/src/ContentProcessorWorkflow/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "azure-appconfiguration==1.8.0", "azure-core==1.38.0", "azure-identity==1.26.0b1", - "azure-monitor-events-extension>=0.1.0", + "azure-monitor-events-extension==0.1.0", "azure-monitor-opentelemetry==1.8.7", "azure-storage-blob==12.29.0b1", "azure-storage-file-datalake==12.23.0", diff --git a/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py b/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py index 0e541284..ae8aa383 100644 --- a/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py +++ b/src/ContentProcessorWorkflow/src/libs/llm_token_telemetry.py @@ -326,11 +326,11 @@ def extract_realtime_usage(response_obj: Any) -> Optional[TokenUsage]: input_tokens=inp, output_tokens=out, total_tokens=tot, - input_audio_tokens=_to_int(_get(in_details, "audio_tokens")), - input_text_tokens=_to_int(_get(in_details, "text_tokens")), - input_cached_tokens=_to_int(_get(in_details, "cached_tokens")), - output_audio_tokens=_to_int(_get(out_details, "audio_tokens")), - output_text_tokens=_to_int(_get(out_details, "text_tokens")), + input_audio_tokens=_to_int(_get(in_details, "audio_tokens")) if _get(in_details, "audio_tokens") is not None else None, + input_text_tokens=_to_int(_get(in_details, "text_tokens")) if _get(in_details, "text_tokens") is not None else None, + input_cached_tokens=_to_int(_get(in_details, "cached_tokens")) if _get(in_details, "cached_tokens") is not None else None, + output_audio_tokens=_to_int(_get(out_details, "audio_tokens")) if _get(out_details, "audio_tokens") is not None else None, + output_text_tokens=_to_int(_get(out_details, "text_tokens")) if _get(out_details, "text_tokens") is not 
None else None, ) if record.has_any or any( v for v in ( @@ -776,7 +776,7 @@ def emit_all( **dimensions, ) - self._log.info( + self._log.debug( "[TOKEN USAGE] agent=%s model=%s input=%d output=%d total=%d %s", agent_name, model_deployment_name, @@ -856,7 +856,7 @@ def add(self, source: Any) -> Optional[TokenUsage]: """ start_ns = time.perf_counter_ns() try: - found = extract_usage(source) or extract_usage_from_stream_chunk(source) + found = extract_usage(source) except Exception as exc: logger.debug("TokenUsageScope.add failed: %s", exc, exc_info=True) return None From efebd812ab32cf6692af68de4511fd8ec0745754 Mon Sep 17 00:00:00 2001 From: Prachig-Microsoft Date: Tue, 2 Jun 2026 16:03:18 +0530 Subject: [PATCH 23/23] fix: exclude telemetry module from Workflow coverage (tested in ContentProcessor) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/tests/ContentProcessorWorkflow/.coveragerc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tests/ContentProcessorWorkflow/.coveragerc b/src/tests/ContentProcessorWorkflow/.coveragerc index 7827f004..ef92ca39 100644 --- a/src/tests/ContentProcessorWorkflow/.coveragerc +++ b/src/tests/ContentProcessorWorkflow/.coveragerc @@ -23,6 +23,9 @@ omit = # Exclude repositories and steps (require agent_framework) */repositories/* */steps/* + # Exclude cross-accelerator telemetry module (tested in ContentProcessor suite) + */libs/llm_token_telemetry.py + */libs/telemetry.py # Exclude test files */tests/* */test_*.py