Integrating OpenAI's GPT-4 into production applications requires careful attention to rate limiting, error handling, and cost optimization. Having implemented GPT-4 in multiple production systems, I'll share practical insights and best practices.
Key Implementation Considerations
Before diving into code, it's worth understanding the main aspects of a production-ready GPT-4 integration:
- Proper rate limiting strategies
- Error handling and fallbacks
- Token optimization techniques
- Cost management approaches
- Response streaming implementation
- Prompt engineering best practices
Basic Implementation
Let's start with a basic implementation that includes proper error handling and rate limiting.
```typescript
import OpenAI from 'openai';
import { RateLimiter } from 'limiter';

export class OpenAIService {
  private openai: OpenAI;
  private limiter: RateLimiter;

  constructor() {
    this.openai = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY,
    });

    // Rate limit to 50 requests per minute
    this.limiter = new RateLimiter({
      tokensPerInterval: 50,
      interval: 'minute'
    });
  }

  async generateResponse(
    prompt: string,
    options: {
      temperature?: number;
      maxTokens?: number;
      model?: string;
    } = {}
  ) {
    // Wait for a rate limit token before calling the API
    await this.limiter.removeTokens(1);

    try {
      const completion = await this.openai.chat.completions.create({
        model: options.model || 'gpt-4',
        messages: [{ role: 'user', content: prompt }],
        temperature: options.temperature || 0.7,
        max_tokens: options.maxTokens || 150,
        presence_penalty: 0.6,
        frequency_penalty: 0.5
      });

      return {
        success: true,
        data: completion.choices[0].message,
        usage: completion.usage
      };
    } catch (error) {
      return this.handleOpenAIError(error);
    }
  }

  private handleOpenAIError(error: any) {
    if (error instanceof OpenAI.APIError) {
      switch (error.status) {
        case 429:
          return {
            success: false,
            error: 'Rate limit exceeded',
            retryAfter: error.headers['retry-after']
          };
        case 500:
          return {
            success: false,
            error: 'OpenAI service error',
            retryable: true
          };
        default:
          return {
            success: false,
            error: error.message
          };
      }
    }

    return {
      success: false,
      error: 'Unknown error occurred'
    };
  }
}
```
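Here is a minimal usage sketch showing how the `retryable` and `retryAfter` fields can drive a simple retry loop. The retry count and backoff values are my own illustrative choices, not part of the service above.

```typescript
// Minimal usage sketch: retry retryable failures with a simple backoff.
// The retry count and wait times here are illustrative, not prescriptive.
const service = new OpenAIService();

async function generateWithRetry(prompt: string, maxAttempts = 3) {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    // Typed loosely so the sketch doesn't depend on the exact inferred return type
    const result: any = await service.generateResponse(prompt, { maxTokens: 300 });

    if (result.success) {
      return result;
    }

    // Give up immediately on errors the service did not mark as retryable
    if (!result.retryable && !result.retryAfter) {
      return result;
    }

    // Respect Retry-After on 429s, otherwise back off exponentially
    const waitSeconds = result.retryAfter ? Number(result.retryAfter) : 2 ** attempt;
    await new Promise((resolve) => setTimeout(resolve, waitSeconds * 1000));
  }

  return { success: false, error: 'Exhausted retry attempts' };
}
```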
Streaming Responses
For better user experience, you might want to implement streaming responses, especially for longer generations.
```typescript
import OpenAI from 'openai';

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

interface StreamOptions {
  model?: string;
  temperature?: number;
}

export async function* streamCompletion(
  prompt: string,
  options: StreamOptions = {}
) {
  try {
    const stream = await openai.chat.completions.create({
      model: options.model || 'gpt-4',
      messages: [{ role: 'user', content: prompt }],
      temperature: options.temperature || 0.7,
      stream: true
    });

    // Yield each content delta as it arrives
    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content || '';
      if (content) {
        yield content;
      }
    }
  } catch (error) {
    console.error('Streaming error:', error);
    throw error;
  }
}
```
The generator can then be exposed from a fetch-style route handler (for example, a Next.js App Router route) that forwards each chunk to the client as a server-sent event:

```typescript
export async function POST(req: Request) {
  const { prompt } = await req.json();

  // Set up a stream we can write SSE-formatted events into
  const encoder = new TextEncoder();
  const stream = new TransformStream();
  const writer = stream.writable.getWriter();

  // streamCompletion is an async generator, so iterate it directly
  (async () => {
    try {
      for await (const chunk of streamCompletion(prompt)) {
        await writer.write(
          encoder.encode(`data: ${JSON.stringify(chunk)}\n\n`)
        );
      }
    } catch (error) {
      console.error('Streaming error:', error);
    } finally {
      await writer.close();
    }
  })();

  return new Response(stream.readable, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache',
      'Connection': 'keep-alive'
    }
  });
}
```
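On the client, the SSE stream can be consumed with `fetch` and a stream reader. This is a minimal browser-side sketch; the `/api/chat` path and the simplified event parsing are assumptions, so adjust them to match your route and add buffering for events split across reads.

```typescript
// Browser-side sketch for consuming the SSE endpoint above.
// '/api/chat' is an assumed path; the parsing assumes each read contains whole events.
async function readCompletionStream(prompt: string, onChunk: (text: string) => void) {
  const response = await fetch('/api/chat', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt })
  });

  const reader = response.body!.getReader();
  const decoder = new TextDecoder();

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // Each event looks like: data: "<json-encoded chunk>"\n\n
    for (const line of decoder.decode(value, { stream: true }).split('\n')) {
      if (line.startsWith('data: ')) {
        onChunk(JSON.parse(line.slice('data: '.length)));
      }
    }
  }
}
```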
Token Management and Cost Optimization
Managing tokens and costs is crucial when working with GPT-4. Here's how to implement token counting, truncation, and response caching.
```typescript
// Note: gpt-3-encoder approximates GPT-4's tokenizer; for exact counts,
// a tiktoken-based encoder is more accurate.
import { encode, decode } from 'gpt-3-encoder';

export function countTokens(text: string): number {
  return encode(text).length;
}

export function truncateToTokenLimit(
  text: string,
  maxTokens: number
): string {
  const tokens = encode(text);

  if (tokens.length <= maxTokens) {
    return text;
  }

  // Keep only the first maxTokens tokens and decode back to text
  const truncatedTokens = tokens.slice(0, maxTokens);
  return decode(truncatedTokens);
}
```
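These helpers can be combined to keep a prompt within the model's context window while leaving room for the completion. A rough sketch, assuming the 8,192-token context of the base gpt-4 model (adjust for other variants):

```typescript
// Rough sketch: budget prompt tokens so prompt + completion fits the context
// window. 8192 applies to base gpt-4; adjust for larger-context variants.
const CONTEXT_WINDOW = 8192;

export function fitPromptToBudget(prompt: string, maxCompletionTokens: number): string {
  const promptBudget = CONTEXT_WINDOW - maxCompletionTokens;

  if (countTokens(prompt) <= promptBudget) {
    return prompt;
  }

  // Truncate the prompt rather than letting the request fail outright
  return truncateToTokenLimit(prompt, promptBudget);
}
```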
```typescript
import Redis from 'ioredis'; // assuming ioredis as the Redis client
import { createHash } from 'crypto';

export class CachedOpenAIService extends OpenAIService {
  private cache: Redis;

  constructor() {
    super();
    this.cache = new Redis(process.env.REDIS_URL);
  }

  async generateResponse(prompt: string, options = {}) {
    const cacheKey = this.generateCacheKey(prompt, options);

    // Try cache first
    const cached = await this.cache.get(cacheKey);
    if (cached) {
      return JSON.parse(cached);
    }

    // Generate new response
    const response = await super.generateResponse(prompt, options);

    // Cache successful responses. Caching pays off most at temperature 0,
    // since sampled responses vary between calls anyway.
    if (response.success) {
      await this.cache.setex(
        cacheKey,
        3600, // Cache for 1 hour
        JSON.stringify(response)
      );
    }

    return response;
  }

  private generateCacheKey(prompt: string, options: any): string {
    const hash = createHash('sha256')
      .update(JSON.stringify({ prompt, options }))
      .digest('hex');
    return `gpt4:${hash}`;
  }
}
```
Monitoring and Observability
Implementing proper monitoring is essential for maintaining reliable AI-powered features in production.
```typescript
import { Counter, Histogram } from 'prom-client';

export class OpenAIMetrics {
  private metrics: {
    requestCount: Counter;
    tokenUsage: Counter;
    latency: Histogram;
    errorRate: Counter;
  };

  constructor() {
    this.metrics = {
      requestCount: new Counter({
        name: 'openai_requests_total',
        help: 'Total number of OpenAI API requests'
      }),
      tokenUsage: new Counter({
        name: 'openai_tokens_total',
        help: 'Total number of tokens used'
      }),
      latency: new Histogram({
        name: 'openai_request_duration_seconds',
        help: 'OpenAI API request duration'
      }),
      errorRate: new Counter({
        name: 'openai_errors_total',
        help: 'Total number of OpenAI API errors'
      })
    };
  }

  async trackRequest<T>(
    operation: () => Promise<T>
  ): Promise<T> {
    const startTime = process.hrtime();

    try {
      const result = await operation();

      // Record metrics
      this.metrics.requestCount.inc();
      this.recordLatency(startTime);

      return result;
    } catch (error) {
      this.metrics.errorRate.inc();
      throw error;
    }
  }

  // Call this with the usage.total_tokens value from each completion
  recordTokenUsage(totalTokens: number) {
    this.metrics.tokenUsage.inc(totalTokens);
  }

  private recordLatency(startTime: [number, number]) {
    const [seconds, nanoseconds] = process.hrtime(startTime);
    const duration = seconds + nanoseconds / 1e9;
    this.metrics.latency.observe(duration);
  }
}
```
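Tying it together, the metrics wrapper can sit around any call to the service. The wiring below is illustrative rather than prescriptive:

```typescript
// Illustrative wiring of the metrics wrapper around the service from earlier.
const metrics = new OpenAIMetrics();
const service = new OpenAIService();

async function generateWithMetrics(prompt: string) {
  // Typed loosely so the sketch doesn't depend on the exact inferred return type
  const result: any = await metrics.trackRequest(() =>
    service.generateResponse(prompt)
  );

  // Record token usage from the API's usage report when available
  if (result.success && result.usage) {
    metrics.recordTokenUsage(result.usage.total_tokens);
  }

  return result;
}
```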
Prompt Engineering Best Practices
Effective prompt engineering is crucial for getting the best results from GPT-4. Here are some key practices to follow:
- Be specific and clear in your instructions
- Provide context and examples
- Use system messages to set behavior
- Implement temperature controls based on use case
- Validate and sanitize user inputs
- Handle multi-turn conversations properly
```typescript
import { countTokens } from './tokens'; // path is illustrative

interface PromptConfig {
  role: string;
  instruction: string;
  context?: string;
  examples?: { input: string; output: string }[];
}

interface ValidationResult {
  isValid: boolean;
  tokenCount: number;
  containsSensitiveInfo: boolean;
  errors: string[];
}

export class PromptManager {
  private systemPrompts: Record<string, string> = {
    translator: 'You are a professional translator. Translate the text accurately while maintaining the original meaning and tone.',
    coder: 'You are an expert programmer. Provide clean, efficient, and well-documented code solutions.',
    writer: 'You are a professional writer. Create engaging content while maintaining the specified tone and style.'
  };

  async generatePrompt({
    role,
    instruction,
    context,
    examples = []
  }: PromptConfig): Promise<string> {
    const systemPrompt = this.systemPrompts[role] || '';

    const examplesText = examples
      .map(ex => `Example:\nInput: ${ex.input}\nOutput: ${ex.output}`)
      .join('\n\n');

    return [
      systemPrompt,
      context && `Context:\n${context}`,
      examplesText,
      `Instruction:\n${instruction}`
    ]
      .filter(Boolean)
      .join('\n\n');
  }

  validatePrompt(prompt: string): ValidationResult {
    const tokenCount = countTokens(prompt);
    const containsSensitiveInfo = this.checkForSensitiveInfo(prompt);

    return {
      isValid: tokenCount <= 4000 && !containsSensitiveInfo,
      tokenCount,
      containsSensitiveInfo,
      errors: []
    };
  }

  // Simple placeholder check; real implementations should use more robust
  // PII detection than these illustrative patterns.
  private checkForSensitiveInfo(prompt: string): boolean {
    const patterns = [
      /\b\d{3}-\d{2}-\d{4}\b/,   // US SSN-like numbers
      /\b(?:\d[ -]?){13,16}\b/   // card-number-like digit runs
    ];
    return patterns.some(pattern => pattern.test(prompt));
  }
}
```
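The PromptManager above flattens everything into a single user message. Since the list also calls for system messages and multi-turn handling, here is a hedged sketch of how the same pieces map onto the chat `messages` array; the conversation-history handling is my own illustration, not part of the classes above.

```typescript
import OpenAI from 'openai';

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// Illustrative sketch: keep the system prompt separate and carry prior turns
// forward so the model sees the conversation history.
type ChatTurn = { role: 'user' | 'assistant'; content: string };

async function continueConversation(
  systemPrompt: string,
  history: ChatTurn[],
  userMessage: string
) {
  const completion = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [
      { role: 'system', content: systemPrompt },
      ...history,
      { role: 'user', content: userMessage }
    ],
    temperature: 0.7
  });

  return completion.choices[0].message;
}
```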
By implementing these patterns and best practices, you can build reliable and cost-effective AI-powered features with GPT-4. Remember to continuously monitor and optimize your implementation based on real-world usage patterns and requirements.