Skip to main content

Monitor Performance

Learn how to set up comprehensive monitoring for your Azure AI Foundry applications to ensure optimal performance, track usage patterns, and proactively identify issues before they impact users.

Overview

Effective monitoring involves tracking multiple layers:
  • Application performance - Response times, error rates, availability
  • AI model performance - Token usage, model latency, quality metrics
  • Infrastructure performance - CPU, memory, network, scaling events
  • Business metrics - User engagement, cost per interaction, conversion rates

Setting up Application Insights

Application Insights provides comprehensive application performance monitoring for Azure applications.

Basic setup

const appInsights = require("applicationinsights");

// Configure Application Insights with full auto-collection (requests,
// dependencies, exceptions, perf counters, console), disk-backed retry
// for offline resilience, and live metrics streaming.
appInsights
  .setup(process.env.APPLICATIONINSIGHTS_CONNECTION_STRING)
  .setAutoDependencyCorrelation(true)
  .setAutoCollectRequests(true)
  .setAutoCollectPerformance(true, true)
  .setAutoCollectExceptions(true)
  .setAutoCollectDependencies(true)
  .setAutoCollectConsole(true, true)
  .setUseDiskRetryCaching(true)
  .setSendLiveMetrics(true)
  .start();

const client = appInsights.defaultClient;

// Stamp every telemetry envelope with this service's role and instance so
// it is distinguishable in the Application Map.
client.addTelemetryProcessor((envelope) => {
  envelope.tags["ai.cloud.role"] = "myapp-api";
  envelope.tags["ai.cloud.roleInstance"] = process.env.HOSTNAME || "local";
  return true; // keep the item (returning false would drop it)
});

Custom telemetry for AI operations

Track specific AI-related metrics:
/**
 * Emits Application Insights telemetry (custom events, dependency records,
 * and metrics) for AI completion calls and user feedback.
 */
class AIPerformanceTracker {
  /**
   * @param {object} client - Application Insights telemetry client; must
   *   expose trackEvent, trackDependency, and trackMetric.
   */
  constructor(client) {
    this.client = client;
  }

  /**
   * Record one AI completion attempt (success or failure).
   * @param {object} operation
   * @param {string} operation.model - Model/deployment name, e.g. "gpt-4o".
   * @param {number} operation.promptTokens
   * @param {number} operation.completionTokens
   * @param {number} operation.totalTokens
   * @param {number} operation.duration - Wall-clock time in milliseconds.
   * @param {boolean} operation.success
   * @param {string} [operation.error] - Error message when success is false.
   * @param {string} [operation.userId]
   * @param {string} [operation.conversationId]
   */
  trackAICompletion(operation) {
    const {
      model,
      promptTokens,
      completionTokens,
      totalTokens,
      duration,
      success,
      error,
      userId,
      conversationId
    } = operation;

    // Compute once; previously calculated twice per call.
    const costEstimate = this.calculateCost(model, totalTokens);

    // Custom event: the primary record queried by dashboards and alerts.
    this.client.trackEvent({
      name: "AICompletion",
      properties: {
        model: model,
        // String(Boolean(...)) cannot throw when success is undefined,
        // unlike the previous success.toString().
        success: String(Boolean(success)),
        error: error || "",
        userId: userId || "anonymous",
        conversationId: conversationId || ""
      },
      measurements: {
        promptTokens: promptTokens,
        completionTokens: completionTokens,
        totalTokens: totalTokens,
        duration: duration,
        costEstimate: costEstimate
      }
    });

    // Dependency record: surfaces in the Application Map / end-to-end
    // transaction view with per-call timing.
    this.client.trackDependency({
      target: "Azure AI Foundry",
      name: `${model} completion`,
      data: `${promptTokens} prompt tokens`,
      duration: duration,
      resultCode: success ? "200" : "500",
      success: success,
      dependencyTypeName: "AI Model"
    });

    // Throughput metric. Guard duration <= 0 (failed calls are tracked
    // with duration 0), which previously produced Infinity/NaN.
    this.client.trackMetric({
      name: "AI.TokensPerSecond",
      value: duration > 0 ? totalTokens / (duration / 1000) : 0
    });

    this.client.trackMetric({
      name: "AI.CostPerRequest",
      value: costEstimate
    });
  }

  /**
   * Rough cost estimate in USD for a call.
   * NOTE: applies the *input* rate to ALL tokens, so calls with many output
   * tokens are under-estimated; refine by splitting prompt/completion counts.
   * @param {string} model - Model name; unknown models fall back to the
   *   gpt-4o (most expensive) tier so costs are never silently understated.
   * @param {number} tokens - Total token count for the call.
   * @returns {number} Estimated cost in USD.
   */
  calculateCost(model, tokens) {
    // USD per 1K tokens (input/output rates).
    const pricing = {
      'gpt-4o': { input: 0.0025, output: 0.01 }, // per 1K tokens
      'gpt-4o-mini': { input: 0.00015, output: 0.0006 },
      'gpt-35-turbo': { input: 0.0005, output: 0.0015 }
    };

    const modelPricing = pricing[model] || pricing['gpt-4o'];
    // (tokens || 0) keeps the result finite when tokens is undefined/NaN-free.
    return ((tokens || 0) / 1000) * modelPricing.input; // Simplified - assumes equal input/output
  }

  /**
   * Record explicit user feedback about a conversation.
   * @param {{rating: number|string, category: string, conversationId: string}} feedback
   */
  trackUserFeedback(feedback) {
    this.client.trackEvent({
      name: "UserFeedback",
      properties: {
        // String(...) with ?? avoids a TypeError when rating is missing.
        rating: String(feedback.rating ?? ""),
        category: feedback.category,
        conversationId: feedback.conversationId
      }
    });
  }
}

const tracker = new AIPerformanceTracker(client);

// POST /api/chat — run a completion and record telemetry for both the
// success and the failure path.
app.post('/api/chat', async (req, res) => {
  const { message, conversationId, userId } = req.body;
  const startTime = Date.now();

  // Shared telemetry call: fills in the fields common to both outcomes.
  const record = (outcome) =>
    tracker.trackAICompletion({
      model: 'gpt-4o',
      duration: Date.now() - startTime,
      userId: userId,
      conversationId: conversationId,
      ...outcome
    });

  try {
    const response = await aiFoundryClient.completions.create({
      messages: [{ role: 'user', content: message }],
      model: 'gpt-4o'
    });

    record({
      promptTokens: response.usage.prompt_tokens,
      completionTokens: response.usage.completion_tokens,
      totalTokens: response.usage.total_tokens,
      success: true
    });

    res.json({
      response: response.choices[0].message.content,
      usage: response.usage
    });
  } catch (error) {
    record({
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: 0,
      success: false,
      error: error.message
    });

    client.trackException({ exception: error });
    res.status(500).json({ error: 'Internal server error' });
  }
});

Monitoring dashboards

Create Application Insights workbooks

Navigate to Application Insights → Workbooks → New to create custom dashboards. AI Performance Workbook JSON (simplified):
{
  "version": "Notebook/1.0",
  "items": [
    {
      "type": 1,
      "content": {
        "json": "# AI Application Performance Dashboard\n\nMonitoring AI model usage, performance, and costs."
      }
    },
    {
      "type": 3,
      "content": {
        "version": "KqlItem/1.0",
        "query": "customEvents\n| where name == \"AICompletion\"\n| where timestamp > ago(24h)\n| summarize \n    TotalRequests = count(),\n    SuccessRate = countif(tostring(customDimensions.success) == \"true\") * 100.0 / count(),\n    AvgDuration = avg(customMeasurements.duration),\n    TotalTokens = sum(customMeasurements.totalTokens),\n    TotalCost = sum(customMeasurements.costEstimate)\nby bin(timestamp, 1h)\n| render timechart",
        "size": 0,
        "title": "AI Operations Over Time"
      }
    },
    {
      "type": 3,
      "content": {
        "version": "KqlItem/1.0",
        "query": "customEvents\n| where name == \"AICompletion\"\n| where timestamp > ago(24h)\n| summarize \n    count(),\n    avg(customMeasurements.duration),\n    avg(customMeasurements.totalTokens)\nby tostring(customDimensions.model)\n| render barchart",
        "title": "Performance by Model"
      }
    }
  ]
}

Key metrics to track

Performance Metrics:
  • Average response time by model
  • 95th percentile response time
  • Request rate (requests per second)
  • Error rate and error types
  • Token processing rate
Business Metrics:
  • Cost per conversation
  • Daily/monthly active users
  • User satisfaction scores
  • Feature adoption rates
Infrastructure Metrics:
  • CPU and memory utilization
  • Auto-scaling events
  • Database query performance
  • Cache hit rates

Real-time alerting

Set up alerts in Azure Monitor

High error rate alert:
# The condition below is a KQL log query over customEvents, so this must be
# a *scheduled query* (log search) alert rule. `az monitor metrics alert
# create` only accepts platform-metric conditions and rejects KQL.
az monitor scheduled-query create \
  --name "High AI Error Rate" \
  --resource-group myResourceGroup \
  --scopes "/subscriptions/{subscription-id}/resourceGroups/myResourceGroup/providers/Microsoft.Insights/components/myapp-insights" \
  --condition "count 'FailedCompletions' > 10" \
  --condition-query FailedCompletions="customEvents | where name == 'AICompletion' and tostring(customDimensions.success) == 'false'" \
  --evaluation-frequency 5m \
  --window-size 5m \
  --severity 2 \
  --description "Alert when AI completion error rate exceeds threshold"
High cost alert:
// Sum the estimated cost of all AI completions in the last hour; the alert
// fires when a row survives the final filter.
customEvents
| where name == "AICompletion"
| where timestamp > ago(1h)
| summarize HourlyCost = sum(customMeasurements.costEstimate)
| where HourlyCost > 100 // Alert if hourly cost exceeds $100
Performance degradation alert:
// Average completion latency over the last 15 minutes; the alert fires when
// a row survives the final filter. Duration is recorded in milliseconds.
customEvents
| where name == "AICompletion"
| where timestamp > ago(15m)
| summarize AvgDuration = avg(customMeasurements.duration)
| where AvgDuration > 5000 // Alert if average duration exceeds 5 seconds

Integrate with notification systems

Slack integration:
const { WebClient } = require('@slack/web-api');
const slack = new WebClient(process.env.SLACK_BOT_TOKEN);

/**
 * Post an alert notification to the #alerts Slack channel using Block Kit:
 * a summary section, a current-value/threshold field pair, and a button
 * linking back to the Azure portal.
 * @param {{name: string, severity: string|number, description: string,
 *          currentValue: *, threshold: *, portalUrl: string}} alert
 */
async function sendAlertToSlack(alert) {
  const summary = {
    type: "section",
    text: {
      type: "mrkdwn",
      text: `*Alert:* ${alert.name}\n*Severity:* ${alert.severity}\n*Description:* ${alert.description}`
    }
  };

  const values = {
    type: "section",
    fields: [
      { type: "mrkdwn", text: `*Current Value:* ${alert.currentValue}` },
      { type: "mrkdwn", text: `*Threshold:* ${alert.threshold}` }
    ]
  };

  const actions = {
    type: "actions",
    elements: [
      {
        type: "button",
        text: { type: "plain_text", text: "View in Azure" },
        url: alert.portalUrl,
        action_id: "view_alert"
      }
    ]
  };

  await slack.chat.postMessage({
    channel: '#alerts',
    text: `🚨 *${alert.name}*`, // fallback text for notifications
    blocks: [summary, values, actions]
  });
}

// Webhook endpoint for Azure alerts
// Webhook endpoint that Azure Monitor action groups POST alerts to.
app.post('/webhooks/azure-alert', express.json(), async (req, res) => {
  const alert = req.body;

  try {
    // Forward only newly fired alerts; resolution notifications are ignored.
    if (alert.status === 'Activated') {
      await sendAlertToSlack({
        name: alert.essentials.alertRule,
        severity: alert.essentials.severity,
        description: alert.essentials.description,
        // Optional chaining: not every alert type carries a metric condition,
        // and the previous unguarded access threw on log/activity alerts.
        currentValue: alert.alertContext?.condition?.allOf?.[0]?.metricValue,
        threshold: alert.alertContext?.condition?.allOf?.[0]?.threshold,
        portalUrl: alert.essentials.portalLink
      });
    }

    res.status(200).send('OK');
  } catch (error) {
    // Previously a Slack failure surfaced as an unhandled rejection and the
    // request never got a response; answer 500 so Azure Monitor can retry.
    res.status(500).send('Failed to process alert');
  }
});

Advanced monitoring techniques

Distributed tracing

Track requests across multiple services:
// NOTE(review): `context` is imported but not used in this snippet — verify
// it is needed by later examples or drop it.
const { trace, context } = require('@opentelemetry/api');
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { AzureMonitorTraceExporter } = require('@azure/monitor-opentelemetry-exporter');

// Initialize OpenTelemetry
// Wire the OpenTelemetry Node SDK to export spans to Application Insights
// via the Azure Monitor exporter (same connection string as the app client).
const sdk = new NodeSDK({
  traceExporter: new AzureMonitorTraceExporter({
    connectionString: process.env.APPLICATIONINSIGHTS_CONNECTION_STRING
  })
});

// Must run before any instrumented code executes so spans are captured.
sdk.start();

// Create spans for AI operations
// SpanStatusCode is a top-level export of '@opentelemetry/api'. The original
// `trace.SpanStatusCode.ERROR` is undefined (the `trace` API object does not
// re-export the enum), so the error status was silently never recorded.
const { SpanStatusCode } = require('@opentelemetry/api');

// POST /api/chat — chat completion wrapped in OpenTelemetry spans: an outer
// span for the whole request plus a nested span for the model call.
app.post('/api/chat', async (req, res) => {
  const tracer = trace.getTracer('ai-chat-service');

  await tracer.startActiveSpan('chat-request', async (span) => {
    try {
      // Attach request identity so traces can be filtered per user/conversation.
      span.setAttributes({
        'user.id': req.body.userId,
        'conversation.id': req.body.conversationId,
        'request.model': 'gpt-4o'
      });

      // AI completion with nested span
      const response = await tracer.startActiveSpan('ai-completion', async (aiSpan) => {
        try {
          const result = await aiFoundryClient.completions.create({
            messages: [{ role: 'user', content: req.body.message }],
            model: 'gpt-4o'
          });

          // Token usage on the inner span for per-call cost analysis.
          aiSpan.setAttributes({
            'ai.model': 'gpt-4o',
            'ai.tokens.prompt': result.usage.prompt_tokens,
            'ai.tokens.completion': result.usage.completion_tokens,
            'ai.tokens.total': result.usage.total_tokens
          });

          return result;
        } finally {
          aiSpan.end(); // always close the inner span, even on failure
        }
      });

      span.setAttributes({
        'response.tokens': response.usage.total_tokens,
        'response.success': true
      });

      res.json({ response: response.choices[0].message.content });

    } catch (error) {
      span.recordException(error);
      // Fix: use the SpanStatusCode enum imported above.
      span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
      res.status(500).json({ error: 'Internal server error' });
    } finally {
      span.end();
    }
  });
});

Custom metrics and logs

Implement structured logging:
const winston = require('winston');
// NOTE(review): '@azure/monitor-opentelemetry-exporter' does not publicly
// export a `WinstonTransport` (it exports AzureMonitor*Exporter classes) —
// verify this import; a dedicated winston→Application Insights transport
// package is likely intended here.
const { WinstonTransport } = require('@azure/monitor-opentelemetry-exporter');

// Structured JSON logger: ISO timestamps, error stacks preserved, emitted
// to the console and forwarded to Application Insights.
const logger = winston.createLogger({
  level: 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    winston.format.json()
  ),
  transports: [
    new winston.transports.Console(),
    new WinstonTransport({
      connectionString: process.env.APPLICATIONINSIGHTS_CONNECTION_STRING
    })
  ]
});

/**
 * Emit one structured log entry for a completed AI operation so it can be
 * queried via traces/customDimensions in Application Insights.
 * @param {{model: string, duration: number, totalTokens: number,
 *          cost: number, success: boolean, userId: string,
 *          conversationId: string}} operation
 */
function logAIOperation(operation) {
  const {
    model,
    duration,
    totalTokens,
    cost,
    success,
    userId,
    conversationId
  } = operation;

  logger.info('AI operation completed', {
    operation: 'completion',
    model,
    duration,
    tokens: totalTokens,
    cost,
    success,
    userId,
    conversationId,
    // Custom dimensions for Application Insights
    customDimensions: {
      feature: 'chat',
      version: process.env.APP_VERSION,
      region: process.env.AZURE_REGION
    }
  });
}

// Usage
logAIOperation({
  model: 'gpt-4o',
  duration: 2500,
  totalTokens: 150,
  cost: 0.0375,
  success: true,
  userId: 'user123',
  conversationId: 'conv456'
});

Performance benchmarking

Implement automated performance testing:
const autocannon = require('autocannon');
const { performance } = require('perf_hooks');

/**
 * Load- and latency-benchmark helpers that report their results to
 * Application Insights via the module-level `client`.
 */
class PerformanceBenchmark {
  constructor() {
    // History of raw autocannon results, newest last.
    this.results = [];
  }

  /**
   * Run an autocannon load test against `url` and report aggregate metrics.
   * @param {string} url - Endpoint to load-test.
   * @param {object} [options] - autocannon overrides (connections, duration, ...).
   * @returns {Promise<object>} The raw autocannon result.
   */
  async runLoadTest(url, options = {}) {
    const defaultOptions = {
      url: url,
      connections: 10,
      pipelining: 1,
      duration: 30,
      headers: {
        'content-type': 'application/json',
        'authorization': `Bearer ${process.env.TEST_API_KEY}`
      },
      body: JSON.stringify({
        message: "What is the weather like today?",
        model: "gpt-4o-mini"
      }),
      method: 'POST'
    };

    const result = await autocannon({ ...defaultOptions, ...options });

    this.results.push({
      timestamp: new Date().toISOString(),
      ...result
    });

    // Guard the denominator: a run that completed zero requests previously
    // reported errorRate = NaN, which Application Insights drops silently.
    const totalRequests = result.requests.total;
    const errorRate =
      totalRequests > 0 ? (result.errors / totalRequests) * 100 : 0;

    // Send results to Application Insights
    client.trackEvent({
      name: "PerformanceBenchmark",
      properties: {
        url: url,
        connections: options.connections?.toString() || "10",
        duration: options.duration?.toString() || "30"
      },
      measurements: {
        requestsPerSecond: result.requests.average,
        latencyP50: result.latency.p50,
        latencyP95: result.latency.p95,
        latencyP99: result.latency.p99,
        throughput: result.throughput.average,
        errorRate: errorRate
      }
    });

    return result;
  }

  /**
   * Time a single POST request and report it to Application Insights.
   * @param {string} endpoint - Absolute URL of the API under test.
   * @param {object} payload - JSON body to send.
   * @returns {Promise<object>} {success, duration, tokens, response} on
   *   completion or {success: false, duration, error} on failure.
   */
  async runSingleRequestBenchmark(endpoint, payload) {
    const startTime = performance.now();

    try {
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${process.env.TEST_API_KEY}`
        },
        body: JSON.stringify(payload)
      });

      const endTime = performance.now();
      const duration = endTime - startTime;

      // NOTE: throws (and falls through to the catch below) if the body is
      // not JSON — e.g. an HTML error page from a gateway.
      const result = await response.json();

      client.trackEvent({
        name: "SingleRequestBenchmark",
        properties: {
          endpoint: endpoint,
          success: response.ok.toString(),
          statusCode: response.status.toString()
        },
        measurements: {
          duration: duration,
          tokens: result.usage?.total_tokens || 0
        }
      });

      return {
        success: response.ok,
        duration: duration,
        tokens: result.usage?.total_tokens || 0,
        response: result
      };

    } catch (error) {
      const endTime = performance.now();
      const duration = endTime - startTime;

      client.trackException({
        exception: error,
        properties: { context: 'SingleRequestBenchmark' }
      });

      return {
        success: false,
        duration: duration,
        error: error.message
      };
    }
  }
}

// Schedule regular performance tests
const benchmark = new PerformanceBenchmark();

// Fire a lightweight synthetic request every 5 minutes; failures are logged
// but never crash the process (the method itself also catches fetch errors,
// so this outer try/catch is a last-resort guard).
setInterval(async () => {
  try {
    await benchmark.runSingleRequestBenchmark(
      'https://myapp.azurecontainerapps.io/api/chat',
      { message: "Health check message", model: "gpt-4o-mini" }
    );
  } catch (error) {
    logger.error('Performance benchmark failed', { error: error.message });
  }
}, 5 * 60 * 1000); // Every 5 minutes

Cost monitoring and optimization

Track and analyze costs

/**
 * Watches AI spending against fixed hourly/daily budgets and raises alerts
 * (telemetry event + Slack notification) when either budget is exceeded.
 */
class CostAnalyzer {
  constructor() {
    // Budget ceilings in USD.
    this.hourlyBudget = 50; // $50 per hour limit
    this.dailyBudget = 1000; // $1000 per day limit
  }

  /**
   * Compare the last hour's and last day's spend against their budgets,
   * alert on overruns, and emit Cost.Hourly / Cost.Daily metrics.
   * @returns {Promise<object>} spending/budget/utilization per period.
   */
  async analyzeCurrentSpending() {
    const HOUR_MS = 60 * 60 * 1000;
    const now = new Date();
    const hourStart = new Date(now.getTime() - HOUR_MS);
    const dayStart = new Date(now.getTime() - 24 * HOUR_MS);

    const hourlySpending = await this.getSpendingForPeriod(hourStart, now);
    const dailySpending = await this.getSpendingForPeriod(dayStart, now);

    if (hourlySpending > this.hourlyBudget) {
      await this.sendCostAlert('hourly', hourlySpending, this.hourlyBudget);
    }
    if (dailySpending > this.dailyBudget) {
      await this.sendCostAlert('daily', dailySpending, this.dailyBudget);
    }

    client.trackMetric({ name: "Cost.Hourly", value: hourlySpending });
    client.trackMetric({ name: "Cost.Daily", value: dailySpending });

    // Uniform shape for both reporting periods.
    const summarize = (spending, budget) => ({
      spending: spending,
      budget: budget,
      utilization: (spending / budget) * 100
    });

    return {
      hourly: summarize(hourlySpending, this.hourlyBudget),
      daily: summarize(dailySpending, this.dailyBudget)
    };
  }

  /**
   * Total estimated AI cost between two instants.
   * @param {Date} startTime
   * @param {Date} endTime
   * @returns {Promise<number>} Total cost in USD (placeholder: always 0).
   */
  async getSpendingForPeriod(startTime, endTime) {
    // This would typically query your cost tracking system
    // For example purposes, calculating from AI operations
    const query = `
      customEvents
      | where name == "AICompletion"
      | where timestamp between (datetime(${startTime.toISOString()}) .. datetime(${endTime.toISOString()}))
      | summarize TotalCost = sum(customMeasurements.costEstimate)
    `;

    // Execute query against Application Insights
    // Return the total cost
    return 0; // Placeholder
  }

  /**
   * Emit a CostAlert telemetry event and notify Slack about a budget overrun.
   * @param {string} period - 'hourly' or 'daily'.
   * @param {number} current - Current spend in USD.
   * @param {number} budget - Budget ceiling in USD.
   */
  async sendCostAlert(period, current, budget) {
    const overrun = current - budget;

    client.trackEvent({
      name: "CostAlert",
      properties: {
        period: period,
        severity: 'high'
      },
      measurements: {
        currentSpending: current,
        budget: budget,
        overrunAmount: overrun,
        overrunPercentage: (overrun / budget) * 100
      }
    });

    // Send notification (Slack, email, etc.)
    await sendAlertToSlack({
      name: `Cost Budget Exceeded - ${period}`,
      severity: 'high',
      description: `${period} spending of $${current.toFixed(2)} exceeded budget of $${budget.toFixed(2)}`,
      currentValue: current,
      threshold: budget
    });
  }
}

// Run cost analysis every 15 minutes
const costAnalyzer = new CostAnalyzer();
// Swallow-and-log so a transient query/notification failure never kills
// the process.
setInterval(async () => {
  try {
    await costAnalyzer.analyzeCurrentSpending();
  } catch (error) {
    logger.error('Cost analysis failed', { error: error.message });
  }
}, 15 * 60 * 1000);

Best practices

1. Establish baseline metrics

Track these key performance indicators from day one:
  • Response time percentiles (P50, P95, P99)
  • Error rates by error type and endpoint
  • Token usage patterns by model and user segment
  • Cost per interaction and daily spending trends

2. Implement progressive alerting

Set up alerts with different severity levels:
  • Info: 80% of budget used, response time above P95
  • Warning: 90% of budget used, error rate above 1%
  • Critical: Budget exceeded, error rate above 5%, service unavailable

3. Monitor user experience

Track metrics that directly impact users:
  • Time to first token for streaming responses
  • Conversation quality scores from user feedback
  • Feature adoption rates and usage patterns
  • User retention and engagement metrics

4. Automate responses

Implement automated responses to common issues:
  • Auto-scaling based on queue depth or response time
  • Circuit breakers for upstream service failures
  • Fallback responses when AI services are unavailable
  • Cost controls that pause expensive operations when budgets are exceeded

Common monitoring patterns

Health checks for AI services

// GET /health/ai — liveness probe that exercises the AI dependency with a
// one-token completion; responds 200 when healthy, 503 otherwise.
app.get('/health/ai', async (req, res) => {
  const healthStatus = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    checks: {}
  };

  try {
    // Minimal-cost probe: single-token completion on the cheapest model.
    const probeStarted = Date.now();
    const probe = await aiFoundryClient.completions.create({
      messages: [{ role: 'user', content: 'test' }],
      model: 'gpt-4o-mini',
      max_tokens: 1
    });

    healthStatus.checks.aiService = {
      status: 'healthy',
      responseTime: Date.now() - probeStarted,
      tokensUsed: probe.usage.total_tokens
    };
  } catch (error) {
    healthStatus.status = 'unhealthy';
    healthStatus.checks.aiService = {
      status: 'unhealthy',
      error: error.message
    };
  }

  res
    .status(healthStatus.status === 'healthy' ? 200 : 503)
    .json(healthStatus);
});

Synthetic monitoring

// Synthetic transaction monitoring
/**
 * Run a fixed set of canned prompts against the model and record, for each
 * scenario, whether the reply matched the expected pattern along with timing
 * and token counts. Failures are tracked per scenario and never propagate.
 */
async function runSyntheticTest() {
  const testScenarios = [
    {
      name: 'Simple question',
      prompt: 'What is 2+2?',
      expectedPattern: /4|four/i,
      maxTokens: 50
    },
    {
      name: 'Complex reasoning',
      prompt: 'Explain the difference between machine learning and deep learning',
      expectedPattern: /neural network|algorithm|data/i,
      maxTokens: 200
    }
  ];

  for (const scenario of testScenarios) {
    const startedAt = Date.now();

    try {
      const completion = await aiFoundryClient.completions.create({
        messages: [{ role: 'user', content: scenario.prompt }],
        model: 'gpt-4o-mini',
        max_tokens: scenario.maxTokens
      });

      const elapsed = Date.now() - startedAt;
      const answer = completion.choices[0].message.content;
      const matched = scenario.expectedPattern.test(answer);

      client.trackEvent({
        name: "SyntheticTest",
        properties: {
          scenario: scenario.name,
          success: matched.toString(),
          model: 'gpt-4o-mini'
        },
        measurements: {
          duration: elapsed,
          tokens: completion.usage.total_tokens,
          responseLength: answer.length
        }
      });
    } catch (error) {
      client.trackEvent({
        name: "SyntheticTest",
        properties: {
          scenario: scenario.name,
          success: 'false',
          error: error.message
        },
        measurements: {
          duration: Date.now() - startedAt
        }
      });
    }
  }
}

// Run synthetic tests every 10 minutes
setInterval(runSyntheticTest, 10 * 60 * 1000);
By implementing comprehensive monitoring, you’ll have visibility into your application’s performance, can proactively identify issues, and optimize for both user experience and cost efficiency.

Next steps


Effective monitoring is an ongoing process. Start with basic metrics and gradually add more sophisticated monitoring as your application grows. Remember that monitoring itself has costs - balance the depth of monitoring with the value it provides.