autoSMART integrates with OpenAI's GPT models to provide intelligent HDD failure predictions based on SMART data analysis. This document covers the API integration, prompt engineering, and response processing.
# OpenAI settings exported as environment variables.
# NOTE(review): the key below is a placeholder - substitute a real key and keep
# it out of version control and shell history.
export OPENAI_API_KEY="sk-your-openai-api-key-here"
export OPENAI_MODEL="gpt-4" # or gpt-3.5-turbo for cost optimization
export OPENAI_MAX_TOKENS=1000
export OPENAI_TEMPERATURE=0.1 # Low temperature for consistent technical analysis
-- Add OpenAI configuration to system_config
-- Seeds six rows, each a (key, value, description) triple. Every value is
-- written as a quoted string literal, including the numeric settings.
-- NOTE(review): 'sk-your-key' is a placeholder - replace it before running, and
-- confirm that storing the key as plain text in this table is acceptable.
INSERT INTO system_config (key, value, description) VALUES
('openai_api_key', 'sk-your-key', 'OpenAI API key for failure predictions'),
('openai_model', 'gpt-4', 'OpenAI model to use (gpt-4, gpt-3.5-turbo)'),
('openai_max_tokens', '1000', 'Maximum tokens per API call'),
('openai_temperature', '0.1', 'Temperature setting for consistent predictions'),
('openai_timeout', '30', 'API timeout in seconds'),
('prediction_interval_hours', '24', 'Hours between AI predictions per drive');
You are an expert storage systems engineer specializing in HDD failure prediction and analysis.
Your expertise includes:
- SMART parameter interpretation across all major manufacturers (WD, Seagate, Hitachi, Toshiba)
- Statistical analysis of drive health trends and patterns
- Hardware failure mode identification and prediction
- Maintenance recommendations based on drive condition
Analyze the provided SMART data and historical trends to:
1. Assess current drive health status
2. Predict failure probability and timeline
3. Identify concerning parameter trends
4. Provide specific maintenance recommendations
Be precise, technical, and provide confidence levels for your predictions.
Return responses in structured JSON format for automated processing.
{
"task": "analyze_drive_health",
"drive_info": {
"serial_number": "WD-XXXXX",
"model": "WD4003FZEX",
"manufacturer": "Western Digital",
"capacity_gb": 4000,
"age_days": 1825,
"power_on_hours": 15000
},
"current_smart": {
"Reallocated_Sector_Ct": 0,
"Spin_Retry_Count": 0,
"Current_Pending_Sector": 1,
"Offline_Uncorrectable": 0,
"UDMA_CRC_Error_Count": 0,
"Raw_Read_Error_Rate": 158584832,
"Seek_Error_Rate": 34405355,
"Power_On_Hours": 15234,
"Load_Cycle_Count": 45123,
"Temperature_Celsius": 42,
"Start_Stop_Count": 1205,
"Power_Cycle_Count": 1198
},
"historical_trends": {
"30_day_changes": {
"Current_Pending_Sector": [0, 0, 0, 1],
"Temperature_Celsius": [38, 39, 41, 42],
"Power_On_Hours": [14950, 15050, 15150, 15234]
},
"parameter_velocities": {
"Current_Pending_Sector": 0.033,
"Temperature_Celsius": 0.133
}
}
}
{
"task": "comparative_analysis",
"drives": [
{
"serial_number": "WD-XXXXX1",
"health_score": 85,
"critical_parameters": ["Current_Pending_Sector"],
"smart_summary": {...}
},
{
"serial_number": "WD-XXXXX2",
"health_score": 92,
"critical_parameters": [],
"smart_summary": {...}
}
],
"analysis_context": {
"environment": "proxmox_cluster",
"usage_pattern": "high_io_database",
"temperature_environment": "datacenter"
}
}
{
"prediction_id": "uuid-generated",
"timestamp": "2025-08-15T10:30:00Z",
"drive_serial": "WD-XXXXX",
"analysis": {
"health_score": 78,
"risk_level": "medium",
"failure_probability": {
"7_days": 0.02,
"30_days": 0.08,
"90_days": 0.15,
"1_year": 0.35
},
"predicted_failure_date": "2026-02-15",
"confidence_level": 0.75
},
"critical_findings": [
{
"parameter": "Current_Pending_Sector",
"current_value": 1,
"trend": "increasing",
"severity": "warning",
"description": "One sector is pending reallocation - monitor closely"
},
{
"parameter": "Temperature_Celsius",
"current_value": 42,
"trend": "increasing",
"severity": "info",
"description": "Temperature trending upward but within normal range"
}
],
"recommendations": [
{
"priority": "high",
"action": "monitor_pending_sectors",
"description": "Monitor pending sector count daily - consider replacement if count increases",
"timeline": "immediate"
},
{
"priority": "medium",
"action": "improve_cooling",
"description": "Consider improving airflow to reduce operating temperature",
"timeline": "within_30_days"
}
],
"manufacturer_specific": {
"western_digital": {
"expected_lifespan_hours": 50000,
"current_usage_percent": 30.5,
"wear_level_assessment": "normal"
}
}
}
=head2 predict_failure

Build a failure prediction for a single drive by sending its current data and
recent trends to the OpenAI API, hand the result to C<_store_prediction>, and
return it.

C<$hdd_id> identifies the drive. C<$options> is a hash reference; its C<days>
entry selects the trend window and falls back to 30 when missing or false.

=cut

sub predict_failure {
    my ($self, $hdd_id, $options) = @_;

    # Inputs for the prompt: the drive's current data plus its recent trends.
    my $snapshot = $self->_gather_drive_data($hdd_id);

    my $window_days = $options->{days} || 30;
    my $trends      = $self->_analyze_trends($hdd_id, $window_days);

    # Turn both into the request document and submit it to the model.
    my $request = $self->_build_analysis_prompt($snapshot, $trends);
    my $result  = $self->_call_openai_api($request);

    # Record the outcome before handing it back to the caller.
    $self->_store_prediction($hdd_id, $result);

    return $result;
}
# Send one chat-completion request to the OpenAI API and return the decoded
# JSON document found in the first choice's message content.
#
# Parameters:
#   $prompt - data structure serialized to JSON and sent as the user message.
#
# Reads from $self: openai_api_key (required), openai_timeout (default 30),
# openai_model (default 'gpt-4'), openai_max_tokens (default 1000),
# openai_temperature (default 0.1; an explicit 0 is honoured).
#
# Returns: the Perl structure decoded from the model's JSON reply.
# Dies:    when no API key is configured, on a non-success HTTP response, when
#          the reply carries no message content, or when any JSON is malformed.
sub _call_openai_api {
    my ($self, $prompt) = @_;

    # Core module, loaded locally so this sub does not rely on file-level imports.
    require Encode;

    my $api_key = $self->{openai_api_key};
    die "OpenAI API error: no API key configured\n"
        unless defined $api_key && length $api_key;

    my $ua = LWP::UserAgent->new(timeout => $self->{openai_timeout} || 30);
    my $request = HTTP::Request->new(POST => 'https://api.openai.com/v1/chat/completions');
    $request->header('Authorization' => "Bearer $api_key");
    $request->header('Content-Type' => 'application/json');

    # Only fall back when the setting is absent: 0 is a valid temperature.
    my $temperature = defined $self->{openai_temperature}
        ? $self->{openai_temperature}
        : 0.1;

    my $payload = {
        model    => $self->{openai_model} || 'gpt-4',
        messages => [
            {
                role    => "system",
                content => $self->_get_system_prompt()
            },
            {
                role => "user",
                # encode_json yields UTF-8 octets; turn them back into characters
                # so the outer encode_json below does not encode them twice.
                content => Encode::decode_utf8(encode_json($prompt))
            }
        ],
        # Numify: settings that arrive as text would otherwise be serialized as
        # JSON strings instead of numbers.
        max_tokens      => 0 + ($self->{openai_max_tokens} || 1000),
        temperature     => 0 + $temperature,
        response_format => { type => "json_object" }
    };
    $request->content(encode_json($payload));

    my $response = $ua->request($request);
    unless ($response->is_success) {
        die "OpenAI API error: " . $response->status_line . "\n" . $response->content;
    }

    my $result  = decode_json($response->content);
    my $content = $result->{choices}[0]{message}{content};
    die "OpenAI API error: response contained no message content\n"
        unless defined $content && length $content;

    # $content is a character string at this point, while decode_json expects
    # UTF-8 octets - re-encode so non-ASCII text in the reply decodes cleanly.
    return decode_json(Encode::encode_utf8($content));
}
# Call _call_openai_api, retrying with exponential backoff when it dies.
#
# Parameters:
#   $prompt      - passed through unchanged to _call_openai_api.
#   $max_retries - total number of attempts; defaults to 3 when missing or 0.
#
# Returns: the value returned by the first successful _call_openai_api call.
# Dies:    "OpenAI API failed after N attempts: <last error>" once every
#          attempt has failed.
sub _call_openai_api_with_retry {
    my ($self, $prompt, $max_retries) = @_;
    $max_retries ||= 3;

    for my $attempt (1 .. $max_retries) {
        my $prediction;

        # A `return` inside eval {} only leaves the eval block, so capture the
        # result in a lexical and return from the sub once the eval succeeded.
        my $succeeded = eval {
            $prediction = $self->_call_openai_api($prompt);
            1;
        };
        return $prediction if $succeeded;

        # Copy $@ right away: the logging calls below may run their own evals.
        my $error = $@ || 'unknown error';
        $self->_log("OpenAI API attempt $attempt failed: $error", 2);

        if ($attempt >= $max_retries) {
            die "OpenAI API failed after $max_retries attempts: $error";
        }

        # Exponential backoff: 2s, 4s, 8s, ...
        my $delay = 2 ** $attempt;
        $self->_log("Retrying in ${delay}s...", 2);
        sleep($delay);
    }

    # Only reached when $max_retries is negative and the loop never ran.
    return;
}
-- Enhanced predictions table
-- Each statement adds one column to predictions; the statements are independent.

-- API bookkeeping: which model answered, how many tokens, and the cost estimate.
ALTER TABLE predictions ADD COLUMN api_model VARCHAR(50);
ALTER TABLE predictions ADD COLUMN api_tokens_used INTEGER;
ALTER TABLE predictions ADD COLUMN api_cost_estimate DECIMAL(10,6);
-- Prediction figures. NOTE(review): presumably these hold 0-1 fractions like
-- the confidence_level / failure_probability values in the JSON response - confirm.
ALTER TABLE predictions ADD COLUMN confidence_level DECIMAL(3,2);
ALTER TABLE predictions ADD COLUMN failure_probability_7d DECIMAL(5,4);
ALTER TABLE predictions ADD COLUMN failure_probability_30d DECIMAL(5,4);
ALTER TABLE predictions ADD COLUMN failure_probability_90d DECIMAL(5,4);
ALTER TABLE predictions ADD COLUMN failure_probability_1y DECIMAL(5,4);
ALTER TABLE predictions ADD COLUMN predicted_failure_date DATE;
-- Structured parts of the response, kept as JSONB.
ALTER TABLE predictions ADD COLUMN recommendations JSONB;
ALTER TABLE predictions ADD COLUMN critical_findings JSONB;
=head2 get_latest_prediction

Look up the newest row in C<predictions> for the drive C<$hdd_id>, joined with
the drive's C<serial_number> and C<model_name> from C<hdd_inventory>.

Returns the result of C<fetchrow_hashref> for that single-row query.

=cut

sub get_latest_prediction {
    my ($self, $hdd_id) = @_;

    # Newest prediction first; LIMIT 1 keeps only that row.
    my $query = join "\n", '',
        'SELECT p.*, hi.serial_number, hi.model_name',
        'FROM predictions p',
        'JOIN hdd_inventory hi ON p.hdd_id = hi.id',
        'WHERE p.hdd_id = ?',
        'ORDER BY p.timestamp DESC',
        'LIMIT 1',
        '';

    my $statement = $self->{db_handle}->prepare($query);
    $statement->execute($hdd_id);

    return $statement->fetchrow_hashref();
}
# Produce predictions for several drives at once.
#
# The drive ids are first partitioned by _group_drives_by_similarity. A group
# holding more than one drive goes through _batch_comparative_analysis as a
# whole; any other group is passed to predict_failure using its first element.
#
# Parameters:
#   $hdd_ids - passed to _group_drives_by_similarity unchanged.
#   $options - forwarded unchanged to whichever analysis method is used.
#
# Returns: the list of everything the analysis calls returned, in group order.
sub predict_multiple_drives {
    my ($self, $hdd_ids, $options) = @_;

    my $clusters = $self->_group_drives_by_similarity($hdd_ids);

    my @results;
    foreach my $cluster (@{$clusters}) {
        # A lone drive has nothing to be compared against: analyse it by itself.
        if (@{$cluster} <= 1) {
            push @results, $self->predict_failure($cluster->[0], $options);
            next;
        }

        # Similar drives are analysed together in one comparative request.
        push @results, $self->_batch_comparative_analysis($cluster, $options);
    }

    return @results;
}
# Return the newest stored prediction for a drive if it is recent enough.
#
# Parameters:
#   $hdd_id      - drive id matched against predictions.hdd_id.
#   $cache_hours - maximum age in hours; defaults to 24 when missing or 0.
#
# Returns: the result of fetchrow_hashref for the newest row inside the window.
sub _get_cached_prediction {
    my ($self, $hdd_id, $cache_hours) = @_;
    $cache_hours ||= 24;

    # PostgreSQL does not accept a bind placeholder in place of the literal in
    # "INTERVAL ? hour", so scale a fixed one-hour interval by the bound number.
    my $sql = q{
        SELECT * FROM predictions
        WHERE hdd_id = ?
          AND timestamp > NOW() - (CAST(? AS double precision) * INTERVAL '1 hour')
        ORDER BY timestamp DESC
        LIMIT 1
    };

    my $sth = $self->{db_handle}->prepare($sql);
    $sth->execute($hdd_id, $cache_hours);

    return $sth->fetchrow_hashref();
}
# Log one API call to api_usage_log and return its estimated cost.
#
# Parameters:
#   $hdd_id      - drive the call was made for.
#   $tokens_used - token count of the call.
#   $model       - model name; 'gpt-4' is priced at 0.00003 per token, every
#                  other name at 0.000002 per token.
#
# Returns: $tokens_used multiplied by the per-token rate.
sub _track_api_usage {
    my ($self, $hdd_id, $tokens_used, $model) = @_;

    # Pick the flat per-token rate for this model.
    my $rate;
    if ($model eq 'gpt-4') {
        $rate = 0.00003;
    }
    else {
        $rate = 0.000002;
    }
    my $cost = $tokens_used * $rate;

    # Persist the usage record; the row timestamp comes from the database.
    my $insert = join "\n", '',
        'INSERT INTO api_usage_log',
        '(hdd_id, timestamp, model, tokens_used, estimated_cost)',
        'VALUES (?, NOW(), ?, ?, ?)',
        '';
    $self->{db_handle}->do($insert, undef, $hdd_id, $model, $tokens_used, $cost);

    return $cost;
}
-- Track prediction accuracy over time
-- One row per prediction made in the last six months, paired with the current
-- status of its drive in hdd_inventory.
-- NOTE(review): the CASE assumes status_changed_at is the moment the drive
-- entered its current status (e.g. the failure time) - confirm against the schema.
CREATE VIEW prediction_accuracy AS
SELECT
p.hdd_id,
p.timestamp as prediction_date,
p.failure_probability_30d,
p.predicted_failure_date,
hi.status_changed_at,
CASE
-- Drive failed on or before the predicted date.
WHEN hi.status = 'failed' AND hi.status_changed_at <= p.predicted_failure_date THEN 'accurate'
-- Drive failed after the predicted date had passed.
WHEN hi.status = 'failed' AND hi.status_changed_at > p.predicted_failure_date THEN 'early'
-- Drive is still active although the predicted date has passed.
WHEN hi.status = 'active' AND NOW() > p.predicted_failure_date THEN 'late'
-- Everything else, including active drives whose predicted date is still ahead.
ELSE 'pending'
END as accuracy_assessment
FROM predictions p
JOIN hdd_inventory hi ON p.hdd_id = hi.id
WHERE p.timestamp > NOW() - INTERVAL '6 months';
-- Monitor API costs and usage patterns
-- Aggregates api_usage_log over the last 30 days into one row per calendar day
-- and model: number of calls, total tokens, and summed estimated cost.
-- Rows are ordered newest day first, then by model name.
SELECT
DATE_TRUNC('day', timestamp) as date,
model,
COUNT(*) as api_calls,
SUM(tokens_used) as total_tokens,
SUM(estimated_cost) as daily_cost
FROM api_usage_log
WHERE timestamp > NOW() - INTERVAL '30 days'
GROUP BY DATE_TRUNC('day', timestamp), model
ORDER BY date DESC, model;
This API reference provides comprehensive guidance for integrating and optimizing OpenAI API usage within the autoSMART system. The implementation focuses on accuracy, cost-effectiveness, and reliable failure prediction capabilities.