|
Bogdan Timofte
authored
3 months ago
|
1
|
# autoSMART API Reference
|
|
|
2
|
|
|
|
3
|
## 🔌 OpenAI API Integration
|
|
|
4
|
|
|
|
5
|
### Overview
|
|
|
6
|
|
|
|
7
|
autoSMART integrates with OpenAI's GPT models to provide intelligent HDD failure predictions based on SMART data analysis. This document covers the API integration, prompt engineering, and response processing.
|
|
|
8
|
|
|
|
9
|
### Configuration
|
|
|
10
|
|
|
|
11
|
#### Environment Variables
|
|
|
12
|
```bash
|
|
|
13
|
export OPENAI_API_KEY="sk-your-openai-api-key-here"
|
|
|
14
|
export OPENAI_MODEL="gpt-4" # or gpt-3.5-turbo for cost optimization
|
|
|
15
|
export OPENAI_MAX_TOKENS=1000
|
|
|
16
|
export OPENAI_TEMPERATURE=0.1 # Low temperature for consistent technical analysis
|
|
|
17
|
```
|
|
|
18
|
|
|
|
19
|
#### Database Configuration
|
|
|
20
|
```sql
|
|
|
21
|
-- Seed system_config with the settings used by the OpenAI integration.
-- All values are stored as text; 'sk-your-key' is a placeholder that must be
-- replaced with a real API key.
INSERT INTO system_config
    (key, value, description)
VALUES
    -- Credentials and model selection
    ('openai_api_key',            'sk-your-key', 'OpenAI API key for failure predictions'),
    ('openai_model',              'gpt-4',       'OpenAI model to use (gpt-4, gpt-3.5-turbo)'),
    -- Request tuning
    ('openai_max_tokens',         '1000',        'Maximum tokens per API call'),
    ('openai_temperature',        '0.1',         'Temperature setting for consistent predictions'),
    ('openai_timeout',            '30',          'API timeout in seconds'),
    -- Scheduling
    ('prediction_interval_hours', '24',          'Hours between AI predictions per drive');
|
|
|
29
|
```
|
|
|
30
|
|
|
|
31
|
## 🤖 AI Prediction System
|
|
|
32
|
|
|
|
33
|
### Prompt Engineering
|
|
|
34
|
|
|
|
35
|
#### System Prompt Template
|
|
|
36
|
```text
|
|
|
37
|
You are an expert storage systems engineer specializing in HDD failure prediction and analysis.
|
|
|
38
|
|
|
|
39
|
Your expertise includes:
|
|
|
40
|
- SMART parameter interpretation across all major manufacturers (WD, Seagate, Hitachi, Toshiba)
|
|
|
41
|
- Statistical analysis of drive health trends and patterns
|
|
|
42
|
- Hardware failure mode identification and prediction
|
|
|
43
|
- Maintenance recommendations based on drive condition
|
|
|
44
|
|
|
|
45
|
Analyze the provided SMART data and historical trends to:
|
|
|
46
|
1. Assess current drive health status
|
|
|
47
|
2. Predict failure probability and timeline
|
|
|
48
|
3. Identify concerning parameter trends
|
|
|
49
|
4. Provide specific maintenance recommendations
|
|
|
50
|
|
|
|
51
|
Be precise, technical, and provide confidence levels for your predictions.
|
|
|
52
|
Return responses in structured JSON format for automated processing.
|
|
|
53
|
```
|
|
|
54
|
|
|
|
55
|
#### User Prompt Templates
|
|
|
56
|
|
|
|
57
|
##### Single Drive Analysis
|
|
|
58
|
```json
|
|
|
59
|
{
|
|
|
60
|
"task": "analyze_drive_health",
|
|
|
61
|
"drive_info": {
|
|
|
62
|
"serial_number": "WD-XXXXX",
|
|
|
63
|
"model": "WD4003FZEX",
|
|
|
64
|
"manufacturer": "Western Digital",
|
|
|
65
|
"capacity_gb": 4000,
|
|
|
66
|
"age_days": 1825,
|
|
|
67
|
"power_on_hours": 15234
|
|
|
68
|
},
|
|
|
69
|
"current_smart": {
|
|
|
70
|
"Reallocated_Sector_Ct": 0,
|
|
|
71
|
"Spin_Retry_Count": 0,
|
|
|
72
|
"Current_Pending_Sector": 1,
|
|
|
73
|
"Offline_Uncorrectable": 0,
|
|
|
74
|
"UDMA_CRC_Error_Count": 0,
|
|
|
75
|
"Raw_Read_Error_Rate": 158584832,
|
|
|
76
|
"Seek_Error_Rate": 34405355,
|
|
|
77
|
"Power_On_Hours": 15234,
|
|
|
78
|
"Load_Cycle_Count": 45123,
|
|
|
79
|
"Temperature_Celsius": 42,
|
|
|
80
|
"Start_Stop_Count": 1205,
|
|
|
81
|
"Power_Cycle_Count": 1198
|
|
|
82
|
},
|
|
|
83
|
"historical_trends": {
|
|
|
84
|
"30_day_changes": {
|
|
|
85
|
"Current_Pending_Sector": [0, 0, 0, 1],
|
|
|
86
|
"Temperature_Celsius": [38, 39, 41, 42],
|
|
|
87
|
"Power_On_Hours": [14950, 15050, 15150, 15234]
|
|
|
88
|
},
|
|
|
89
|
"parameter_velocities": {
|
|
|
90
|
"Current_Pending_Sector": 0.033,
|
|
|
91
|
"Temperature_Celsius": 0.133
|
|
|
92
|
}
|
|
|
93
|
}
|
|
|
94
|
}
|
|
|
95
|
```
|
|
|
96
|
|
|
|
97
|
##### Multi-Drive Comparative Analysis
|
|
|
98
|
```json
|
|
|
99
|
{
|
|
|
100
|
"task": "comparative_analysis",
|
|
|
101
|
"drives": [
|
|
|
102
|
{
|
|
|
103
|
"serial_number": "WD-XXXXX1",
|
|
|
104
|
"health_score": 85,
|
|
|
105
|
"critical_parameters": ["Current_Pending_Sector"],
|
|
|
106
|
"smart_summary": {...}
|
|
|
107
|
},
|
|
|
108
|
{
|
|
|
109
|
"serial_number": "WD-XXXXX2",
|
|
|
110
|
"health_score": 92,
|
|
|
111
|
"critical_parameters": [],
|
|
|
112
|
"smart_summary": {...}
|
|
|
113
|
}
|
|
|
114
|
],
|
|
|
115
|
"analysis_context": {
|
|
|
116
|
"environment": "proxmox_cluster",
|
|
|
117
|
"usage_pattern": "high_io_database",
|
|
|
118
|
"temperature_environment": "datacenter"
|
|
|
119
|
}
|
|
|
120
|
}
|
|
|
121
|
```
|
|
|
122
|
|
|
|
123
|
### Response Format
|
|
|
124
|
|
|
|
125
|
#### Standard Health Assessment Response
|
|
|
126
|
```json
|
|
|
127
|
{
|
|
|
128
|
"prediction_id": "uuid-generated",
|
|
|
129
|
"timestamp": "2025-08-15T10:30:00Z",
|
|
|
130
|
"drive_serial": "WD-XXXXX",
|
|
|
131
|
"analysis": {
|
|
|
132
|
"health_score": 78,
|
|
|
133
|
"risk_level": "medium",
|
|
|
134
|
"failure_probability": {
|
|
|
135
|
"7_days": 0.02,
|
|
|
136
|
"30_days": 0.08,
|
|
|
137
|
"90_days": 0.15,
|
|
|
138
|
"1_year": 0.35
|
|
|
139
|
},
|
|
|
140
|
"predicted_failure_date": "2026-02-15",
|
|
|
141
|
"confidence_level": 0.75
|
|
|
142
|
},
|
|
|
143
|
"critical_findings": [
|
|
|
144
|
{
|
|
|
145
|
"parameter": "Current_Pending_Sector",
|
|
|
146
|
"current_value": 1,
|
|
|
147
|
"trend": "increasing",
|
|
|
148
|
"severity": "warning",
|
|
|
149
|
"description": "One sector is pending reallocation - monitor closely"
|
|
|
150
|
},
|
|
|
151
|
{
|
|
|
152
|
"parameter": "Temperature_Celsius",
|
|
|
153
|
"current_value": 42,
|
|
|
154
|
"trend": "increasing",
|
|
|
155
|
"severity": "info",
|
|
|
156
|
"description": "Temperature trending upward but within normal range"
|
|
|
157
|
}
|
|
|
158
|
],
|
|
|
159
|
"recommendations": [
|
|
|
160
|
{
|
|
|
161
|
"priority": "high",
|
|
|
162
|
"action": "monitor_pending_sectors",
|
|
|
163
|
"description": "Monitor pending sector count daily - consider replacement if count increases",
|
|
|
164
|
"timeline": "immediate"
|
|
|
165
|
},
|
|
|
166
|
{
|
|
|
167
|
"priority": "medium",
|
|
|
168
|
"action": "improve_cooling",
|
|
|
169
|
"description": "Consider improving airflow to reduce operating temperature",
|
|
|
170
|
"timeline": "within_30_days"
|
|
|
171
|
}
|
|
|
172
|
],
|
|
|
173
|
"manufacturer_specific": {
|
|
|
174
|
"western_digital": {
|
|
|
175
|
"expected_lifespan_hours": 50000,
|
|
|
176
|
"current_usage_percent": 30.5,
|
|
|
177
|
"wear_level_assessment": "normal"
|
|
|
178
|
}
|
|
|
179
|
}
|
|
|
180
|
}
|
|
|
181
|
```
|
|
|
182
|
|
|
|
183
|
## 🔧 Implementation Details
|
|
|
184
|
|
|
|
185
|
### SmartAnalyzer.pm API Integration
|
|
|
186
|
|
|
|
187
|
#### Core API Methods
|
|
|
188
|
```perl
|
|
|
189
|
=head2 predict_failure

    my $prediction = $analyzer->predict_failure($hdd_id, \%options);

Produce an AI-generated failure prediction for a single drive.

The drive's current data and its trend history are collected, combined into a
prompt, sent to the OpenAI API, and the resulting prediction is stored before
being returned to the caller.

Options:

    days    Length of the trend window in days. An unset or zero value
            falls back to 30.

=cut

sub predict_failure {
    my ($self, $hdd_id, $options) = @_;

    # Step 1: current snapshot of the drive.
    my $snapshot = $self->_gather_drive_data($hdd_id);

    # Step 2: trend history over the requested window (30 days by default).
    my $window_days = $options->{days} || 30;
    my $trends      = $self->_analyze_trends($hdd_id, $window_days);

    # Step 3: turn both into the prompt and ask the model.
    my $request_prompt = $self->_build_analysis_prompt($snapshot, $trends);
    my $result         = $self->_call_openai_api($request_prompt);

    # Step 4: persist the prediction, then hand it back.
    $self->_store_prediction($hdd_id, $result);

    return $result;
}
|
|
|
213
|
```
|
|
|
214
|
|
|
|
215
|
#### API Request Handler
|
|
|
216
|
```perl
|
|
|
217
|
# Send one chat-completion request to OpenAI and return the decoded prediction.
#
# Arguments:
#   $prompt - reference to the prompt data; it is serialized with encode_json
#             and sent as the user message.
#
# Reads from $self: openai_api_key, openai_model, openai_max_tokens,
# openai_temperature, openai_timeout.
#
# Returns the model's reply parsed from JSON into a Perl data structure.
#
# Dies with a message starting "OpenAI API error:" when no API key is set,
# when the HTTP request fails, when the response carries no message content,
# or when the model's reply is not valid JSON.
sub _call_openai_api {
    my ($self, $prompt) = @_;

    die "OpenAI API error: no API key configured\n"
        unless defined $self->{openai_api_key} && length $self->{openai_api_key};

    # Settings may have been loaded as strings (the sample system_config rows
    # store them as text). Force numeric values so encode_json emits JSON
    # numbers rather than quoted strings.
    my $timeout    = ($self->{openai_timeout}    || 30)   + 0;
    my $max_tokens = int($self->{openai_max_tokens} || 1000);

    # 0 is a legitimate temperature, so test for "set" rather than "true".
    my $temperature = $self->{openai_temperature};
    $temperature = 0.1 unless defined $temperature && length $temperature;
    $temperature += 0;

    my $ua = LWP::UserAgent->new(timeout => $timeout);

    my $request = HTTP::Request->new(POST => 'https://api.openai.com/v1/chat/completions');
    $request->header('Authorization' => "Bearer $self->{openai_api_key}");
    $request->header('Content-Type'  => 'application/json');

    my $payload = {
        model    => $self->{openai_model} || 'gpt-4',
        messages => [
            { role => "system", content => $self->_get_system_prompt() },
            { role => "user",   content => encode_json($prompt) },
        ],
        max_tokens  => $max_tokens,
        temperature => $temperature,
        # NOTE(review): JSON mode is only accepted by models that support it;
        # confirm the configured model does, otherwise the API rejects the request.
        response_format => { type => "json_object" },
    };

    $request->content(encode_json($payload));

    my $response = $ua->request($request);

    unless ($response->is_success) {
        die "OpenAI API error: " . $response->status_line . "\n" . $response->content;
    }

    # The HTTP body is raw UTF-8 octets, which is what decode_json expects.
    my $result = decode_json($response->content);

    my $choice = (ref($result) eq 'HASH' && ref($result->{choices}) eq 'ARRAY')
        ? $result->{choices}[0]
        : undef;
    my $content = (ref($choice) eq 'HASH' && ref($choice->{message}) eq 'HASH')
        ? $choice->{message}{content}
        : undef;

    die "OpenAI API error: response contained no message content\n"
        unless defined $content && length $content;

    # $content came out of decode_json and is therefore a character string.
    # Re-encode it to UTF-8 octets before decoding the nested JSON, otherwise
    # replies containing non-ASCII characters fail to parse.
    # (Assumes encode_json/decode_json are the UTF-8 functional interface of a
    # JSON module; the file's imports are not visible here.)
    require Encode;
    my $prediction = eval { decode_json(Encode::encode_utf8($content)) };

    unless (defined $prediction) {
        my $parse_error = $@ || 'unknown parse error';
        my $finish      = defined $choice->{finish_reason} ? $choice->{finish_reason} : 'unknown';
        # finish_reason "length" means the reply was cut off at max_tokens.
        die "OpenAI API error: model reply is not valid JSON "
          . "(finish_reason: $finish): $parse_error";
    }

    return $prediction;
}
|
|
|
254
|
```
|
|
|
255
|
|
|
|
256
|
### Error Handling and Retry Logic
|
|
|
257
|
|
|
|
258
|
```perl
|
|
|
259
|
# Call _call_openai_api, retrying with exponential backoff on failure.
#
# Arguments:
#   $prompt      - passed through unchanged to _call_openai_api.
#   $max_retries - total number of attempts; defaults to 3 when unset or 0.
#
# Returns the prediction from the first successful attempt.
# Dies with "OpenAI API failed after N attempts: <last error>" once every
# attempt has failed.
sub _call_openai_api_with_retry {
    my ($self, $prompt, $max_retries) = @_;
    $max_retries ||= 3;

    my $last_error = 'no attempts were made';

    for my $attempt (1 .. $max_retries) {
        my $prediction;

        # A "return" inside eval only leaves the eval block, so capture the
        # result in a variable and use the eval's own value as a success flag.
        my $succeeded = eval {
            $prediction = $self->_call_openai_api($prompt);
            1;
        };
        return $prediction if $succeeded;

        # Copy $@ immediately: later calls (such as _log) may overwrite it.
        $last_error = $@ || 'unknown error';
        $self->_log("OpenAI API attempt $attempt failed: $last_error", 2);

        last if $attempt >= $max_retries;

        # Exponential backoff: 2s, 4s, 8s, ...
        my $delay = 2 ** $attempt;
        $self->_log("Retrying in ${delay}s...", 2);
        sleep($delay);
    }

    die "OpenAI API failed after $max_retries attempts: $last_error";
}
|
|
|
282
|
```
|
|
|
283
|
|
|
|
284
|
## 📊 Prediction Storage and Retrieval
|
|
|
285
|
|
|
|
286
|
### Database Schema for Predictions
|
|
|
287
|
```sql
|
|
|
288
|
-- Enhanced predictions table
-- Adds the API bookkeeping and prediction-detail columns in one statement so
-- the table is altered once. IF NOT EXISTS makes the migration safe to re-run.
-- (ADD COLUMN IF NOT EXISTS requires PostgreSQL 9.6 or later; the JSONB type
-- used below is PostgreSQL-specific as well.)
ALTER TABLE predictions
    -- API bookkeeping
    ADD COLUMN IF NOT EXISTS api_model               VARCHAR(50),
    ADD COLUMN IF NOT EXISTS api_tokens_used         INTEGER,
    ADD COLUMN IF NOT EXISTS api_cost_estimate       DECIMAL(10,6),
    -- Prediction details
    ADD COLUMN IF NOT EXISTS confidence_level        DECIMAL(3,2),
    ADD COLUMN IF NOT EXISTS failure_probability_7d  DECIMAL(5,4),
    ADD COLUMN IF NOT EXISTS failure_probability_30d DECIMAL(5,4),
    ADD COLUMN IF NOT EXISTS failure_probability_90d DECIMAL(5,4),
    ADD COLUMN IF NOT EXISTS failure_probability_1y  DECIMAL(5,4),
    ADD COLUMN IF NOT EXISTS predicted_failure_date  DATE,
    -- Structured parts of the model's reply
    ADD COLUMN IF NOT EXISTS recommendations         JSONB,
    ADD COLUMN IF NOT EXISTS critical_findings       JSONB;
|
|
|
300
|
```
|
|
|
301
|
|
|
|
302
|
### Prediction Retrieval Methods
|
|
|
303
|
```perl
|
|
|
304
|
=head2 get_latest_prediction

    my $row = $analyzer->get_latest_prediction($hdd_id);

Get the most recent prediction for a drive.

Returns a hash reference holding every column of the newest C<predictions>
row for C<$hdd_id>, plus C<serial_number> and C<model_name> from
C<hdd_inventory>. Returns C<undef> when the drive has no predictions.

=cut

sub get_latest_prediction {
    my ($self, $hdd_id) = @_;

    my $sql = q{
        SELECT p.*, hi.serial_number, hi.model_name
        FROM predictions p
        JOIN hdd_inventory hi ON p.hdd_id = hi.id
        WHERE p.hdd_id = ?
        ORDER BY p.timestamp DESC
        LIMIT 1
    };

    # selectrow_hashref prepares, executes, fetches the first row and then
    # finishes the statement, so no active handle is left behind after
    # reading a single row.
    return $self->{db_handle}->selectrow_hashref($sql, undef, $hdd_id);
}
|
|
|
327
|
```
|
|
|
328
|
|
|
|
329
|
## 🎯 Performance Optimization
|
|
|
330
|
|
|
|
331
|
### API Usage Optimization
|
|
|
332
|
|
|
|
333
|
#### Batch Processing
|
|
|
334
|
```perl
|
|
|
335
|
# Run predictions for several drives at once.
#
# Arguments:
#   $hdd_ids - passed to _group_drives_by_similarity, which returns a reference
#              to a list of groups (each group is itself an array reference).
#   $options - forwarded unchanged to whichever analysis method handles a group.
#
# Returns the list of everything the per-group analyses returned, in group order.
sub predict_multiple_drives {
    my ($self, $hdd_ids, $options) = @_;

    my @results;
    my $buckets = $self->_group_drives_by_similarity($hdd_ids);

    foreach my $bucket (@$buckets) {
        # Groups with several members are analysed comparatively in one go;
        # a lone drive goes through the individual prediction path.
        push @results,
            @$bucket > 1
                ? $self->_batch_comparative_analysis($bucket, $options)
                : $self->predict_failure($bucket->[0], $options);
    }

    return @results;
}
|
|
|
354
|
```
|
|
|
355
|
|
|
|
356
|
#### Caching Strategy
|
|
|
357
|
```perl
|
|
|
358
|
# Look up a recent stored prediction so a new API call can be skipped.
#
# Arguments:
#   $hdd_id      - drive whose predictions are searched.
#   $cache_hours - maximum age of an acceptable prediction, in hours;
#                  defaults to 24 when unset or 0.
#
# Returns the newest matching predictions row as a hash reference, or undef
# when no prediction is recent enough.
sub _get_cached_prediction {
    my ($self, $hdd_id, $cache_hours) = @_;
    $cache_hours ||= 24;

    # A bind placeholder cannot appear inside an INTERVAL literal in
    # PostgreSQL, so the hour count is bound as a number and multiplied by a
    # fixed one-hour interval instead.
    # (Assumes PostgreSQL, as the JSONB columns and DATE_TRUNC calls elsewhere
    # in this document suggest.)
    my $sql = q{
        SELECT * FROM predictions
        WHERE hdd_id = ?
        AND timestamp > NOW() - (? * INTERVAL '1 hour')
        ORDER BY timestamp DESC
        LIMIT 1
    };

    # selectrow_hashref finishes the statement after fetching the first row.
    return $self->{db_handle}->selectrow_hashref($sql, undef, $hdd_id, $cache_hours);
}
|
|
|
375
|
```
|
|
|
376
|
|
|
|
377
|
### Cost Management
|
|
|
378
|
|
|
|
379
|
#### Token Usage Tracking
|
|
|
380
|
```perl
|
|
|
381
|
# Record one API call in api_usage_log and return its estimated cost.
#
# Arguments:
#   $hdd_id      - drive the API call was made for.
#   $tokens_used - number of tokens consumed by the call.
#   $model       - model name, e.g. 'gpt-4'.
#
# Returns the estimated cost (tokens_used multiplied by a flat per-token rate).
sub _track_api_usage {
    my ($self, $hdd_id, $tokens_used, $model) = @_;

    # Work on defined copies so a missing model or token count does not
    # trigger "uninitialized value" warnings; the values written to the log
    # below stay exactly as the caller passed them.
    my $model_name  = defined $model       ? $model       : '';
    my $token_count = defined $tokens_used ? $tokens_used : 0;

    # Flat per-token estimates. Dated snapshot names such as "gpt-4-0613" are
    # priced like "gpt-4"; every other model uses the lower rate.
    my $is_gpt4        = $model_name =~ /\Agpt-4(?:-\d{4})?\z/;
    my $cost_per_token = $is_gpt4 ? 0.00003 : 0.000002;
    my $estimated_cost = $token_count * $cost_per_token;

    # Log usage statistics
    my $sql = q{
        INSERT INTO api_usage_log
        (hdd_id, timestamp, model, tokens_used, estimated_cost)
        VALUES (?, NOW(), ?, ?, ?)
    };

    $self->{db_handle}->do($sql, undef, $hdd_id, $model, $tokens_used, $estimated_cost);

    return $estimated_cost;
}
|
|
|
399
|
```
|
|
|
400
|
|
|
|
401
|
## 📈 Analytics and Reporting
|
|
|
402
|
|
|
|
403
|
### Prediction Accuracy Tracking
|
|
|
404
|
```sql
|
|
|
405
|
-- Track prediction accuracy over time.
-- One row per prediction made in the last six months, labelled by comparing
-- the predicted failure date with the drive's status in hdd_inventory.
CREATE VIEW prediction_accuracy AS
SELECT
    p.hdd_id,
    p.timestamp AS prediction_date,
    p.failure_probability_30d,
    p.predicted_failure_date,
    hi.status_changed_at,
    CASE
        -- Drive failed on or before the predicted date.
        WHEN hi.status = 'failed'
             AND hi.status_changed_at <= p.predicted_failure_date
            THEN 'accurate'
        -- Drive failed, but only after the predicted date.
        WHEN hi.status = 'failed'
             AND hi.status_changed_at > p.predicted_failure_date
            THEN 'early'
        -- Predicted date has passed and the drive is still active.
        WHEN hi.status = 'active'
             AND NOW() > p.predicted_failure_date
            THEN 'late'
        -- Everything else, including rows with no predicted date.
        ELSE 'pending'
    END AS accuracy_assessment
FROM predictions AS p
INNER JOIN hdd_inventory AS hi
    ON p.hdd_id = hi.id
WHERE p.timestamp > NOW() - INTERVAL '6 months';
|
|
|
422
|
```
|
|
|
423
|
|
|
|
424
|
### API Cost Analysis
|
|
|
425
|
```sql
|
|
|
426
|
-- Monitor API costs and usage patterns.
-- Daily totals per model for the last 30 days, newest day first.
WITH recent_usage AS (
    -- Usage rows from the last 30 days, each tagged with its calendar day.
    SELECT
        DATE_TRUNC('day', timestamp) AS date,
        model,
        tokens_used,
        estimated_cost
    FROM api_usage_log
    WHERE timestamp > NOW() - INTERVAL '30 days'
)
SELECT
    date,
    model,
    COUNT(*)            AS api_calls,
    SUM(tokens_used)    AS total_tokens,
    SUM(estimated_cost) AS daily_cost
FROM recent_usage
GROUP BY date, model
ORDER BY date DESC, model;
|
|
|
437
|
```
|
|
|
438
|
|
|
|
439
|
This API reference provides comprehensive guidance for integrating and optimizing OpenAI API usage within the autoSMART system. The implementation focuses on accuracy, cost-effectiveness, and reliable failure prediction capabilities.
|