Deploy to Production
Learn how to take your Azure AI Foundry applications from development to production environments with confidence. This guide covers deployment strategies, scaling considerations, monitoring setup, and production best practices.
Prerequisites
- A working Azure AI Foundry application
- Azure subscription with appropriate permissions
- Understanding of your application’s resource requirements
- Estimated time: 2-3 hours
Planning your production deployment
1. Architecture review
Before deploying, ensure your application architecture is production-ready:
Stateless design: keep conversation state in external storage so your application can scale horizontally
// Good: Stateless API endpoint
app.post('/api/chat', async (req, res) => {
const { message, conversationId } = req.body;
// Retrieve context from external storage
const context = await getConversationContext(conversationId);
const response = await aiFoundryClient.completions.create({
messages: [...context, { role: 'user', content: message }],
model: 'gpt-4o'
});
// Store updated context
await saveConversationContext(conversationId, [...context,
{ role: 'user', content: message },
{ role: 'assistant', content: response.choices[0].message.content }
]);
res.json({ response: response.choices[0].message.content });
});
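The getConversationContext and saveConversationContext helpers above are not shown in this guide. A minimal sketch, assuming Redis (node-redis v4) as the external store and an illustrative 24-hour expiry:
const { createClient } = require('redis');

const redisClient = createClient({ url: process.env.REDIS_URL });
// node-redis v4 requires an explicit connection before issuing commands
redisClient.connect().catch(console.error);

// Load prior messages for a conversation; an empty array means a new conversation
async function getConversationContext(conversationId) {
  const raw = await redisClient.get(`conversation:${conversationId}`);
  return raw ? JSON.parse(raw) : [];
}

// Persist the updated history with a 24-hour expiry
async function saveConversationContext(conversationId, messages) {
  await redisClient.set(`conversation:${conversationId}`, JSON.stringify(messages), { EX: 60 * 60 * 24 });
}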
Environment separation: Use different AI Foundry projects for dev/staging/prod
# Development
export AZURE_AI_PROJECT_URL="https://myapp-dev.eastus.ai.azure.com"
# Staging
export AZURE_AI_PROJECT_URL="https://myapp-staging.eastus.ai.azure.com"
# Production
export AZURE_AI_PROJECT_URL="https://myapp-prod.eastus.ai.azure.com"
2. Resource planning
Calculate your expected usage:
Token estimation for a chat application:
// Estimate monthly token usage
const estimatedDailyUsers = 1000;
const averageMessagesPerUser = 10;
const averageTokensPerMessage = 150; // input + output
const monthlyTokens = estimatedDailyUsers * averageMessagesPerUser * averageTokensPerMessage * 30;
console.log(`Estimated monthly tokens: ${monthlyTokens.toLocaleString()}`);
// Estimated monthly tokens: 45,000,000
Model selection for scale:
- High-volume, simple tasks: GPT-4o mini or Llama models
- Complex reasoning: GPT-4o for critical paths only
- Mixed workload: Route different request types to appropriate models (see the sketch below)
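A minimal sketch of request-type routing, assuming your API tags each request with a type (the MODEL_BY_REQUEST_TYPE map and routeModel helper are illustrative, not part of any SDK); a fuller complexity-based router appears under Cost optimization:
// Illustrative mapping from request type to model deployment
const MODEL_BY_REQUEST_TYPE = {
  classification: 'gpt-4o-mini',
  summarization: 'gpt-4o-mini',
  chat: 'gpt-4o-mini',
  analysis: 'gpt-4o',
  generation: 'gpt-4o'
};

// Fall back to the cheaper model when the request type is unknown
function routeModel(requestType) {
  return MODEL_BY_REQUEST_TYPE[requestType] || 'gpt-4o-mini';
}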
Deployment strategies
Option 1: Azure Container Apps (Recommended)
Best for scalable web applications and APIs.
Step 1: Containerize your application
Create a production Dockerfile:
FROM node:18-alpine
WORKDIR /app
# Copy package files
COPY package*.json ./
RUN npm ci --omit=dev
# Copy application code
COPY . .
# Create non-root user
RUN addgroup -g 1001 -S nodejs
RUN adduser -S nextjs -u 1001
USER nextjs
EXPOSE 3000
CMD ["node", "server.js"]
Step 2: Build and push to Azure Container Registry
# Create container registry
az acr create --resource-group myResourceGroup --name myappregistry --sku Basic
# Build and push image
az acr build --registry myappregistry --image myapp:latest .
Step 3: Deploy to Container Apps
# Create Container Apps environment
az containerapp env create \
--name myapp-env \
--resource-group myResourceGroup \
--location eastus
# Deploy the application
az containerapp create \
--name myapp \
--resource-group myResourceGroup \
--environment myapp-env \
--image myappregistry.azurecr.io/myapp:latest \
--target-port 3000 \
--ingress external \
--min-replicas 1 \
--max-replicas 10 \
--cpu 0.5 \
--memory 1Gi \
--secrets "ai-project-url=https://myapp-prod.eastus.ai.azure.com" \
--env-vars AZURE_AI_PROJECT_URL=secretref:ai-project-url
Option 2: Azure App Service
Good for traditional web applications.
# Create App Service plan
az appservice plan create \
--name myapp-plan \
--resource-group myResourceGroup \
--sku P1V3 \
--is-linux
# Create web app
az webapp create \
--resource-group myResourceGroup \
--plan myapp-plan \
--name myapp-prod \
--deployment-container-image-name myappregistry.azurecr.io/myapp:latest
# Configure app settings
az webapp config appsettings set \
--resource-group myResourceGroup \
--name myapp-prod \
--settings AZURE_AI_PROJECT_URL="https://myapp-prod.eastus.ai.azure.com"
Option 3: Azure Kubernetes Service (AKS)
For complex applications requiring orchestration.
deployment.yaml:
apiVersion: apps/v1
kind: Deployment
metadata:
  name: myapp
spec:
  replicas: 3
  selector:
    matchLabels:
      app: myapp
  template:
    metadata:
      labels:
        app: myapp
    spec:
      containers:
        - name: myapp
          image: myappregistry.azurecr.io/myapp:latest
          ports:
            - containerPort: 3000
          env:
            - name: AZURE_AI_PROJECT_URL
              valueFrom:
                secretKeyRef:
                  name: ai-foundry-config
                  key: project-url
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"
---
apiVersion: v1
kind: Service
metadata:
  name: myapp-service
spec:
  selector:
    app: myapp
  ports:
    - protocol: TCP
      port: 80
      targetPort: 3000
  type: LoadBalancer
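The deployment reads the project URL from a Kubernetes secret named ai-foundry-config, so create it before applying the manifest:
# Create the secret referenced by the deployment
kubectl create secret generic ai-foundry-config \
--from-literal=project-url="https://myapp-prod.eastus.ai.azure.com"
# Apply the deployment and service
kubectl apply -f deployment.yaml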
Configuration management
Environment variables and secrets
Never hardcode credentials in your application:
// Good: Use environment variables
const config = {
aiFoundryUrl: process.env.AZURE_AI_PROJECT_URL,
aiFoundryKey: process.env.AZURE_AI_API_KEY,
databaseUrl: process.env.DATABASE_URL,
redisUrl: process.env.REDIS_URL
};
// Validate required environment variables
const requiredEnvVars = ['AZURE_AI_PROJECT_URL', 'AZURE_AI_API_KEY'];
for (const envVar of requiredEnvVars) {
if (!process.env[envVar]) {
throw new Error(`Missing required environment variable: ${envVar}`);
}
}
Azure Key Vault integration
const { SecretClient } = require("@azure/keyvault-secrets");
const { DefaultAzureCredential } = require("@azure/identity");
const credential = new DefaultAzureCredential();
const vaultName = "myapp-keyvault";
const url = `https://${vaultName}.vault.azure.net`;
const client = new SecretClient(url, credential);
async function getSecret(secretName) {
try {
const secret = await client.getSecret(secretName);
return secret.value;
} catch (error) {
console.error(`Failed to retrieve secret ${secretName}:`, error);
throw error;
}
}
// Usage (call this from within an async function)
const aiFoundryKey = await getSecret("azure-ai-api-key");
Auto-scaling configuration
For Container Apps:
az containerapp update \
--name myapp \
--resource-group myResourceGroup \
--min-replicas 2 \
--max-replicas 20 \
--scale-rule-name "http-rule" \
--scale-rule-type "http" \
--scale-rule-http-concurrency 100
For App Service:
az monitor autoscale create \
--resource-group myResourceGroup \
--resource myapp-plan \
--resource-type Microsoft.Web/serverfarms \
--name myapp-autoscale \
--min-count 2 \
--max-count 10 \
--count 2
# Add CPU-based scaling rule
az monitor autoscale rule create \
--resource-group myResourceGroup \
--autoscale-name myapp-autoscale \
--condition "Percentage CPU > 70 avg 5m" \
--scale out 2
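The rule above only scales out; without a companion scale-in rule the instance count never drops back. A sketch of the matching rule (the 30% threshold is an illustrative choice):
# Add CPU-based scale-in rule
az monitor autoscale rule create \
--resource-group myResourceGroup \
--autoscale-name myapp-autoscale \
--condition "Percentage CPU < 30 avg 5m" \
--scale in 1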
Caching strategies
Implement caching to reduce AI model calls:
const crypto = require('crypto');
const redis = require('redis');
const client = redis.createClient({
url: process.env.REDIS_URL
});
// node-redis v4 requires an explicit connection before commands are issued
client.connect().catch(console.error);
async function getCachedResponse(prompt, model) {
const cacheKey = `ai:${model}:${crypto.createHash('md5').update(prompt).digest('hex')}`;
try {
const cached = await client.get(cacheKey);
if (cached) {
return JSON.parse(cached);
}
} catch (error) {
console.warn('Cache retrieval error:', error);
}
return null;
}
async function setCachedResponse(prompt, model, response, ttlSeconds = 3600) {
const cacheKey = `ai:${model}:${crypto.createHash('md5').update(prompt).digest('hex')}`;
try {
await client.setEx(cacheKey, ttlSeconds, JSON.stringify(response));
} catch (error) {
console.warn('Cache storage error:', error);
}
}
// Usage in your API endpoint
app.post('/api/chat', async (req, res) => {
const { message, model = 'gpt-4o' } = req.body;
// Check cache first
let response = await getCachedResponse(message, model);
if (!response) {
// Make AI call
response = await aiFoundryClient.completions.create({
messages: [{ role: 'user', content: message }],
model: model
});
// Cache the response
await setCachedResponse(message, model, response);
}
res.json(response);
});
Monitoring and observability
Application Insights integration
const appInsights = require("applicationinsights");
appInsights.setup(process.env.APPLICATIONINSIGHTS_CONNECTION_STRING)
.setAutoDependencyCorrelation(true)
.setAutoCollectRequests(true)
.setAutoCollectPerformance(true)
.setAutoCollectExceptions(true)
.setAutoCollectDependencies(true)
.setAutoCollectConsole(true)
.setUseDiskRetryCaching(true)
.start();
const client = appInsights.defaultClient;
// Custom telemetry for AI operations
async function trackAIOperation(operation, duration, success, tokenCount) {
client.trackEvent({
name: "AIOperation",
properties: {
operation: operation,
success: success.toString(),
tokenCount: tokenCount.toString()
},
measurements: {
duration: duration,
tokenCount: tokenCount
}
});
}
// Usage
const startTime = Date.now();
try {
const response = await aiFoundryClient.completions.create({
messages: [{ role: 'user', content: message }],
model: 'gpt-4o'
});
const duration = Date.now() - startTime;
await trackAIOperation('completion', duration, true, response.usage.total_tokens);
} catch (error) {
const duration = Date.now() - startTime;
await trackAIOperation('completion', duration, false, 0);
throw error;
}
Health checks
Implement comprehensive health checks:
app.get('/health', async (req, res) => {
const healthCheck = {
uptime: process.uptime(),
timestamp: Date.now(),
status: 'OK',
checks: {}
};
try {
// Check database connectivity
await checkDatabase();
healthCheck.checks.database = 'OK';
} catch (error) {
healthCheck.checks.database = 'FAIL';
healthCheck.status = 'DEGRADED';
}
try {
// Check AI Foundry connectivity
await checkAIFoundry();
healthCheck.checks.aiFoundry = 'OK';
} catch (error) {
healthCheck.checks.aiFoundry = 'FAIL';
healthCheck.status = 'DEGRADED';
}
try {
// Check Redis connectivity
await checkRedis();
healthCheck.checks.redis = 'OK';
} catch (error) {
healthCheck.checks.redis = 'FAIL';
healthCheck.status = 'DEGRADED';
}
const statusCode = healthCheck.status === 'OK' ? 200 : 503;
res.status(statusCode).json(healthCheck);
});
async function checkAIFoundry() {
const response = await aiFoundryClient.completions.create({
messages: [{ role: 'user', content: 'test' }],
model: 'gpt-4o',
max_tokens: 1
});
return response.choices.length > 0;
}
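The health endpoint also calls checkDatabase and checkRedis, which are not defined in this guide. A minimal sketch, assuming PostgreSQL via the pg package and the node-redis client from the Caching strategies section; substitute whichever drivers your application actually uses:
const { Pool } = require('pg'); // assumption: PostgreSQL; swap in your database driver
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

// A trivial query proves the database is reachable and accepting connections
async function checkDatabase() {
  await pool.query('SELECT 1');
}

// PING proves the Redis connection is alive (client is the node-redis client created earlier)
async function checkRedis() {
  await client.ping();
}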
Security hardening
Authentication and authorization
const jwt = require('jsonwebtoken');
const rateLimit = require('express-rate-limit');
// Rate limiting
const limiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 100, // limit each IP to 100 requests per windowMs
message: 'Too many requests from this IP'
});
app.use('/api/', limiter);
// JWT middleware
function authenticateToken(req, res, next) {
const authHeader = req.headers['authorization'];
const token = authHeader && authHeader.split(' ')[1];
if (!token) {
return res.sendStatus(401);
}
jwt.verify(token, process.env.JWT_SECRET, (err, user) => {
if (err) return res.sendStatus(403);
req.user = user;
next();
});
}
// Apply authentication to protected routes
app.use('/api/chat', authenticateToken);
Input validation
const { body, validationResult } = require('express-validator');
app.post('/api/chat',
[
body('message')
.isLength({ min: 1, max: 4000 })
.trim()
.escape(),
body('model')
.optional()
.isIn(['gpt-4o', 'gpt-4o-mini', 'gpt-35-turbo'])
],
async (req, res) => {
const errors = validationResult(req);
if (!errors.isEmpty()) {
return res.status(400).json({ errors: errors.array() });
}
// Process the validated input
const { message, model = 'gpt-4o' } = req.body;
// ... rest of the handler
}
);
Deployment automation
CI/CD with GitHub Actions
.github/workflows/deploy.yml:
name: Deploy to Production
on:
  push:
    branches: [ main ]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Setup Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '18'
      - name: Install dependencies
        run: npm ci
      - name: Run tests
        run: npm test
      - name: Run linting
        run: npm run lint
  deploy:
    needs: test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Login to Azure
        uses: azure/login@v1
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}
      - name: Build and push to ACR
        run: |
          az acr build --registry myappregistry --image myapp:${{ github.sha }} .
      - name: Deploy to Container Apps
        run: |
          az containerapp update \
            --name myapp \
            --resource-group myResourceGroup \
            --image myappregistry.azurecr.io/myapp:${{ github.sha }}
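As an optional hardening step, a smoke test appended to the deploy job's steps can call the /health endpoint from the Health checks section and fail the run if the app is unhealthy (the URL is a placeholder for your app's ingress FQDN):
- name: Smoke test
  run: |
    # --fail makes curl exit non-zero on HTTP errors, failing the workflow
    curl --fail --retry 5 --retry-delay 10 https://<your-app-fqdn>/health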
Production checklist
Before going live, verify:
Security
- Secrets live in Azure Key Vault or platform secret stores, never in code, images, or source control
- API endpoints require authentication and enforce rate limits
- User input is validated and length-limited before it reaches the model
Monitoring
- Application Insights is connected and collecting requests, dependencies, and exceptions
- Custom telemetry records AI operation latency, success rate, and token usage
- The /health endpoint reports database, Redis, and AI Foundry status
Reliability
- Auto-scaling rules are configured with sensible minimum and maximum instance counts
- Calls to the model use timeouts and retries with exponential backoff
- The application shuts down gracefully on SIGTERM, closing database and Redis connections
Compliance
- Production runs against a dedicated AI Foundry project, separate from dev and staging
- Logging and retention of conversation data meet your organization's requirements
Troubleshooting common issues
High latency
// Add request timeout and retry logic
const axios = require('axios');
const aiClient = axios.create({
baseURL: process.env.AZURE_AI_PROJECT_URL,
timeout: 30000,
headers: {
'Authorization': `Bearer ${process.env.AZURE_AI_API_KEY}`
}
});
// Add retry logic with exponential backoff (axios-retry package)
const axiosRetry = require('axios-retry');
const retryConfig = {
retries: 3,
retryDelay: (retryCount) => {
return Math.pow(2, retryCount) * 1000; // 1s, 2s, 4s
},
retryCondition: (error) => {
return error.response?.status >= 500 || error.code === 'ECONNABORTED';
}
};
axiosRetry(aiClient, retryConfig);
Memory leaks
// Implement proper cleanup on shutdown (database, redis, and server are your app's own handles)
process.on('SIGTERM', async () => {
console.log('SIGTERM received, shutting down gracefully');
// Close database connections
await database.close();
// Close Redis connections
await redis.quit();
// Stop accepting new requests
server.close(() => {
console.log('Process terminated');
process.exit(0);
});
});
Cost optimization
// Implement smart model routing based on complexity
function selectOptimalModel(prompt) {
const complexity = analyzePromptComplexity(prompt);
if (complexity.score < 0.3) {
return 'gpt-4o-mini'; // Fast and cheap for simple queries
}
return 'gpt-4o'; // Higher quality for moderate and complex tasks
}
function analyzePromptComplexity(prompt) {
const indicators = {
length: prompt.length > 500,
multiStep: /step|then|after|next|following/.test(prompt.toLowerCase()),
reasoning: /analyze|compare|evaluate|explain why/.test(prompt.toLowerCase()),
creative: /write|create|generate|imagine/.test(prompt.toLowerCase())
};
const score = Object.values(indicators).filter(Boolean).length / Object.keys(indicators).length;
return { score, indicators };
}
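Wiring the router into the chat endpoint is then a one-line change; for example:
// Let the router choose the deployment instead of trusting a client-supplied model
const model = selectOptimalModel(message);
const response = await aiFoundryClient.completions.create({
  messages: [{ role: 'user', content: message }],
  model
});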
Next steps
Your application is now running in production. As usage grows, revisit the auto-scaling, caching, and cost optimization sections above.
Production deployments require careful planning and testing. Always use staging environments to validate changes before deploying to production. Monitor your application closely after deployment and be prepared to roll back if issues arise.