9 changes: 9 additions & 0 deletions workbench/connections/dataset.csv
@@ -0,0 +1,9 @@
customer_id,age,income,credit_score,loan_amount,employment_years,default_risk
1001,34,62000,720,15000,8,0
1002,45,89000,680,25000,12,0
1003,29,41000,590,8000,3,1
1004,52,105000,750,35000,18,0
1005,38,71000,640,12000,9,0
1006,61,95000,710,28000,22,0
1007,25,35000,550,5000,1,1
1008,48,82000,690,18000,15,0
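As a quick local check before this file is uploaded in the notebook below, a minimal sketch like the following (assuming pandas is available in the workbench image) confirms the columns and the class balance of `default_risk`:

import pandas as pd

# Inspect the sample loan dataset before uploading it to S3
df = pd.read_csv("dataset.csv")
print(df.shape)                           # expected: (8, 7)
print(df.columns.tolist())
print(df["default_risk"].value_counts())  # 6 non-default rows, 2 default rows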
@@ -1,11 +1,73 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9a23ec5a3a38cb88",
"metadata": {},
"source": [
"## Verify the connection environment variables"
]
},
{
"cell_type": "markdown",
"id": "77f321c8dd7e0ce4",
"metadata": {},
"source": [
"1. Verify the S3 connection variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "662e9bf2a5f940aa",
"metadata": {},
"outputs": [],
"source": [
"!env | grep \"AWS_\""
]
},
{
"cell_type": "markdown",
"id": "d392c653fb27a890",
"metadata": {},
"source": [
"2. Verify the OCI connection variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b7708fbb9db1ae8",
"metadata": {},
"outputs": [],
"source": [
"!env | grep \"OCI_HOST=\\|ACCESS_TYPE=\" && env | grep -A7 \"dockerconfigjson=\""
]
},
{
"cell_type": "markdown",
"id": "22f4de0e4318188d",
"metadata": {},
"source": [
"3. Verify the URI connection variable."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ff98987195d8c55",
"metadata": {},
"outputs": [],
"source": [
"!env | grep \"^URI=\""
]
},
{
"cell_type": "markdown",
"id": "cabab9d9-2949-46e7-808b-7bc70211164e",
"metadata": {},
"source": [
"## Connecting to S3 with Data Connections and Boto3\n",
"## Upload the `dataset.csv` file to S3\n",
"\n",
"This notebook uses the `boto3` library, which is the AWS SDK for Python.\n",
"This library is included in some workbench images included with RHOAI, such as `Standard Data Science`.\n",
@@ -65,7 +127,7 @@
"metadata": {},
"source": [
"3. View a retrieved value.\n",
"Note that the `key_id` matches the value provided in the form when creating the data connection.\n",
"Note that the `endpoint` matches the value provided in the form when creating the data connection.\n",
"\n",
"> WARNING: Because cell outputs are saved as part of the notebook file, be cautious when printing sensitive information to notebook output.\n",
"If you leave sensitive credentials printed in an output cell, then you might accidentally leak this information when the notebook is committed to version control."
@@ -80,7 +142,7 @@
},
"outputs": [],
"source": [
"key_id"
"endpoint"
]
},
{
@@ -106,7 +168,6 @@
" aws_access_key_id=key_id,\n",
" aws_secret_access_key=secret_key,\n",
" endpoint_url=endpoint,\n",
" use_ssl=False\n",
")"
]
},
@@ -142,24 +203,20 @@
{
"cell_type": "code",
"execution_count": null,
"id": "b25303ea-cd23-4958-b08a-c5a52beda700",
"id": "4f74228da1377bd7",
"metadata": {},
"outputs": [],
"source": [
"# create a file-like object containing bytes that represent the \"hello world\" string\n",
"file_obj = io.BytesIO(b\"hello world\")\n",
"\n",
"# upload the file-like object to the S3 bucket specified in the data connection\n",
"# the name of the \"file\" in S3 is \"hello.txt\"\n",
"s3.upload_fileobj(file_obj, bucket_name, Key=\"hello.txt\")"
"# Upload the dataset.csv file\n",
"s3.upload_file(\"./dataset.csv\", bucket_name, \"dataset.csv\")"
]
},
{
"cell_type": "markdown",
"id": "7b407ad7-14b2-4149-9220-a7188fd3a05c",
"metadata": {},
"source": [
"6. List the contents of the bucket specified in the data connection."
"7. List the contents of the bucket specified in the data connection."
]
},
{
@@ -182,37 +239,7 @@
"id": "63f252dd-3abe-4dd9-ad09-282047b6d97a",
"metadata": {},
"source": [
"> NOTE: Optionally, verify the corresponding S3 bucket for the new `hello.txt` object."
]
},
{
"cell_type": "markdown",
"id": "b9e8cd99-7644-4d02-849b-c30bd744ffaa",
"metadata": {},
"source": [
"7. Download the file from the S3 bucket to a new location."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c1ea1b1-dc8e-4027-bc56-75a2ce8084d6",
"metadata": {},
"outputs": [],
"source": [
"s3.download_file(bucket_name, \"hello.txt\", \"new_hello.txt\")"
]
},
{
"cell_type": "markdown",
"id": "c1c591fb-f880-401c-b52f-e719ae83c753",
"metadata": {},
"source": [
"8. The pane to the left displays a new file called `new_hello.txt`.\n",
"Open the file and verify that its contents are `hello world`.\n",
"\n",
"> NOTE: You might need to refresh the file browser by clicking the `Refresh the file browser` button in the file browser pane.\n",
"The button displays as a circular arrow."
"> NOTE: Optionally, verify the corresponding S3 bucket for the new `dataset.csv` object."
]
},
{
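Pieced together from the hunks above (several of which are collapsed in this view), the notebook's S3 flow amounts roughly to the following sketch. The environment variable names (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_S3_ENDPOINT, AWS_S3_BUCKET) are the ones an RHOAI S3 data connection typically injects; they are an assumption here because the cell that reads them is not shown in this diff.

import os
import boto3

# Read the data connection values injected as environment variables (assumed names)
key_id = os.environ["AWS_ACCESS_KEY_ID"]
secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
endpoint = os.environ["AWS_S3_ENDPOINT"]
bucket_name = os.environ["AWS_S3_BUCKET"]

# Create the S3 client against the endpoint from the data connection
s3 = boto3.client(
    "s3",
    aws_access_key_id=key_id,
    aws_secret_access_key=secret_key,
    endpoint_url=endpoint,
)

# Upload the sample dataset and list the bucket to confirm the object exists
s3.upload_file("./dataset.csv", bucket_name, "dataset.csv")
for obj in s3.list_objects_v2(Bucket=bucket_name).get("Contents", []):
    print(obj["Key"])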
4 changes: 4 additions & 0 deletions workbench/working/.gitignore
@@ -0,0 +1,4 @@
# Files generated during data processing and model training
logs/
data_scaler.pkl
*_data.csv
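The ignored names hint at artifacts created later in the exercise. Purely as an illustration (none of these steps appear in this diff), a scaler pickle and split CSVs matching those patterns might be produced along these lines:

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Illustrative only: split the generated metrics and persist the kinds of
# artifacts that .gitignore excludes (data_scaler.pkl and *_data.csv)
df = pd.read_csv("server_metrics.csv")
features = df.drop(columns=["server_id", "timestamp", "workload_type", "failure_within_48h"])
train_X, test_X = train_test_split(features, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(train_X)
joblib.dump(scaler, "data_scaler.pkl")
train_X.to_csv("train_data.csv", index=False)
test_X.to_csv("test_data.csv", index=False)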
142 changes: 142 additions & 0 deletions workbench/working/.tools/generate_dataset.py
@@ -0,0 +1,142 @@
"""
Generate synthetic server predictive maintenance dataset.
This script creates a realistic dataset for predicting server failures.
"""

# /// script
# dependencies = [
# "numpy",
# "pandas",
# ]
# ///

import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples (server monitoring snapshots)
n_samples = 15000

# Generate server identifiers
server_id = np.random.choice([f'server_{i:03d}' for i in range(500)], n_samples)

# Generate temporal information
timestamp = pd.date_range('2024-01-01', periods=n_samples, freq='1H')

# Generate server characteristics
server_age_months = np.random.randint(6, 60, n_samples)
workload_type = np.random.choice(['web', 'database', 'compute', 'storage'], n_samples)

# Generate normal operating metrics (with some variation)
cpu_temp_celsius = np.random.normal(55, 8, n_samples)
cpu_utilization_percent = np.random.normal(45, 20, n_samples)
memory_usage_percent = np.random.normal(60, 15, n_samples)
disk_io_ops_per_sec = np.random.normal(500, 200, n_samples)
network_throughput_mbps = np.random.normal(300, 100, n_samples)
fan_speed_rpm = np.random.normal(3000, 300, n_samples)
power_draw_watts = np.random.normal(250, 50, n_samples)
disk_read_errors_24h = np.random.poisson(2, n_samples)
memory_errors_24h = np.random.poisson(1, n_samples)

# Create a risk score for each server based on multiple factors
# Using much stronger, clearer patterns for >80% accuracy
risk_score = np.zeros(n_samples)

# Temperature risk (0-3 points)
risk_score += np.where(cpu_temp_celsius > 75, 3,
              np.where(cpu_temp_celsius > 65, 2,
              np.where(cpu_temp_celsius > 60, 1, 0)))

# Age risk (0-2 points)
risk_score += np.where(server_age_months > 48, 2,
              np.where(server_age_months > 36, 1, 0))

# CPU utilization risk (0-2 points)
risk_score += np.where(cpu_utilization_percent > 90, 2,
              np.where(cpu_utilization_percent > 80, 1, 0))

# Memory usage risk (0-2 points)
risk_score += np.where(memory_usage_percent > 90, 2,
              np.where(memory_usage_percent > 80, 1, 0))

# Disk errors risk (0-3 points)
risk_score += np.where(disk_read_errors_24h > 10, 3,
              np.where(disk_read_errors_24h > 5, 2,
              np.where(disk_read_errors_24h > 2, 1, 0)))

# Memory errors risk (0-3 points)
risk_score += np.where(memory_errors_24h > 5, 3,
              np.where(memory_errors_24h > 3, 2,
              np.where(memory_errors_24h > 1, 1, 0)))

# Fan speed risk (0-1 point)
risk_score += np.where(fan_speed_rpm < 2500, 1, 0)

# Calculate failure probability based on total risk score
# Risk score ranges from 0 to 16
# Ultra-clear thresholds for best precision/recall with simple models
failure_probability = np.where(
    risk_score >= 10, 0.99,              # Very high risk: 99% failure (near certain)
    np.where(risk_score >= 8, 0.95,      # High risk: 95% failure
    np.where(risk_score >= 6, 0.75,      # Moderate-high risk: 75% failure
    np.where(risk_score >= 4, 0.20,      # Low-moderate risk: 20% failure
    np.where(risk_score >= 2, 0.03,      # Low risk: 3% failure
    0.005))))                            # Very low risk: 0.5% failure (almost never)
)

# Add critical servers with extreme conditions (near-certain failure)
critical_indices = np.random.choice(n_samples, size=int(n_samples * 0.12), replace=False)
cpu_temp_celsius[critical_indices] = np.random.uniform(82, 93, len(critical_indices))
disk_read_errors_24h[critical_indices] = np.random.randint(15, 30, len(critical_indices))
memory_errors_24h[critical_indices] = np.random.randint(8, 15, len(critical_indices))
cpu_utilization_percent[critical_indices] = np.random.uniform(88, 99, len(critical_indices))
memory_usage_percent[critical_indices] = np.random.uniform(88, 99, len(critical_indices))
fan_speed_rpm[critical_indices] = np.random.uniform(1800, 2300, len(critical_indices))
failure_probability[critical_indices] = 0.99 # Almost certain failure

# Clip probabilities and metrics to realistic ranges
failure_probability = np.clip(failure_probability, 0, 1)
cpu_temp_celsius = np.clip(cpu_temp_celsius, 35, 95)
cpu_utilization_percent = np.clip(cpu_utilization_percent, 5, 100)
memory_usage_percent = np.clip(memory_usage_percent, 20, 100)
disk_io_ops_per_sec = np.clip(disk_io_ops_per_sec, 50, 2000)
network_throughput_mbps = np.clip(network_throughput_mbps, 10, 1000)
fan_speed_rpm = np.clip(fan_speed_rpm, 1500, 5000)
power_draw_watts = np.clip(power_draw_watts, 100, 500)

# Generate binary failure outcome
failure_within_48h = (np.random.random(n_samples) < failure_probability).astype(int)

# Create dataframe
df = pd.DataFrame({
    'server_id': server_id,
    'timestamp': timestamp,
    'server_age_months': server_age_months,
    'workload_type': workload_type,
    'cpu_temp_celsius': cpu_temp_celsius.round(1),
    'cpu_utilization_percent': cpu_utilization_percent.round(1),
    'memory_usage_percent': memory_usage_percent.round(1),
    'disk_io_ops_per_sec': disk_io_ops_per_sec.round(0).astype(int),
    'network_throughput_mbps': network_throughput_mbps.round(1),
    'fan_speed_rpm': fan_speed_rpm.round(0).astype(int),
    'power_draw_watts': power_draw_watts.round(1),
    'disk_read_errors_24h': disk_read_errors_24h,
    'memory_errors_24h': memory_errors_24h,
    'failure_within_48h': failure_within_48h
})

# Save to CSV
df.to_csv('server_metrics.csv', index=False)

print(f"Generated dataset with {n_samples} samples")
print(f"Failure rate: {failure_within_48h.mean():.1%}")
print(f"Number of unique servers: {df['server_id'].nunique()}")
print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nTarget distribution:")
print(df['failure_within_48h'].value_counts())
print("\nWorkload distribution:")
print(df['workload_type'].value_counts())
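The script's comments claim the risk-score patterns support better than 80% accuracy with simple models. A minimal check of that claim, assuming scikit-learn is available alongside the script's declared dependencies, might look like this:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Quick check that a simple model reaches the accuracy the comments describe
df = pd.read_csv("server_metrics.csv")
X = pd.get_dummies(df.drop(columns=["server_id", "timestamp", "failure_within_48h"]),
                   columns=["workload_type"])
y = df["failure_within_48h"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions):.2%}")
print(classification_report(y_test, predictions))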
7 changes: 7 additions & 0 deletions workbench/working/README.md
@@ -0,0 +1,7 @@
# Materials for `workbench-working` guided exercise

**Scenario**: Server Predictive Maintenance

Use these materials by following the instructions in this exercise:

**GE**: https://github.com/RedHatTraining/AI0015L/blob/main/content/workbench/working/ge.adoc