# Project 4: DDoS Attack Detection

**Objective:** To build a high-performance machine learning model that can accurately distinguish between legitimate (Benign) network traffic and malicious DDoS attack traffic based on network flow features.

**Dataset Source:** CIC-DDoS2019 dataset from Kaggle - a modern and extensive dataset containing various up-to-date DDoS attack types.

**Model:** RandomForestClassifier - excellent for handling large feature sets and providing interpretable results for security analysis.

**Business Value:** Enables real-time threat detection, service availability protection, and automated network defense.

## 1. Setup Kaggle API and Download Dataset

In [None]:
import os

# Check if kaggle.json already exists to avoid re-uploading
if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("--- Setting up Kaggle API ---")
    !pip install -q kaggle
    
    # For Google Colab - prompt user to upload their kaggle.json file
    try:
        from google.colab import files
        print("\nPlease upload your kaggle.json file:")
        uploaded = files.upload()
        if 'kaggle.json' not in uploaded:
            print("\nError: kaggle.json not uploaded.")
            exit()
        print("\nkaggle.json uploaded successfully.")
        !mkdir -p ~/.kaggle
        !cp kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    except ImportError:
        print("Not running in Google Colab. Please ensure Kaggle API is configured.")
        print("Place your kaggle.json in ~/.kaggle/ directory")
else:
    print("Kaggle API already configured.")

In [None]:
print("\n--- Downloading CIC-DDoS2019 Dataset from Kaggle ---")
print("‚ö†Ô∏è This is a large dataset. The download may take several minutes.")

# Download the CIC-DDoS2019 dataset
!kaggle datasets download -d frazane/cicddos2019

print("\n--- Unzipping the dataset ---")
# The dataset is composed of multiple large files
!unzip -q cicddos2019.zip -d cicddos2019
print("Dataset setup complete.")

# List the contents to understand the structure
print("\nDataset structure:")
!ls -la cicddos2019/
!ls -la cicddos2019/CSVs/ | head -10

## 2. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    precision_recall_curve, roc_auc_score, roc_curve
)
from sklearn.preprocessing import LabelEncoder

# Plot defaults: stock matplotlib style with seaborn's "husl" colour palette
plt.style.use('default')
sns.set_palette(palette="husl")

# Confirm the environment and record the core library versions in the output
print("Libraries imported successfully.")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

In [None]:
print("--- Loading and Preprocessing Data ---")

# The dataset is split into multiple CSVs. We'll load a representative sample
# to keep memory usage manageable while demonstrating the full pipeline
path = 'cicddos2019/CSVs'

# Load specific files that contain diverse attack types and benign traffic
filenames = [
    os.path.join(path, 'DrDoS_NTP.csv'),      # NTP reflection attack
    os.path.join(path, 'syn_and_benign.csv')  # SYN flood + benign traffic
]

# Check if files exist and load them
df_list = []
for filename in filenames:
    if os.path.exists(filename):
        print(f"Loading {filename}...")
        df_temp = pd.read_csv(filename)
        print(f"  - Shape: {df_temp.shape}")
        print(f"  - Labels: {df_temp['Label'].value_counts().to_dict()}")
        df_list.append(df_temp)
    else:
        print(f"‚ö†Ô∏è File not found: {filename}")

if len(df_list) == 0:
    print("‚ùå No data files found. Please check the dataset extraction.")
    # List available files for debugging
    print("Available files:")
    !ls cicddos2019/CSVs/
    exit()

# Combine all loaded dataframes
df = pd.concat(df_list, ignore_index=True)
print(f"\n‚úÖ Successfully loaded {len(df_list)} files.")
print(f"Combined dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 3. Data Exploration and Analysis

In [None]:
# Initial data exploration
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Columns: {len(df.columns)}")

# Display basic info
print("\n=== COLUMN INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Show first few rows
print("\n=== SAMPLE DATA ===")
print(df.head())

# Examine target variable (label_counts is reused by the plotting cell)
print("\n=== LABEL DISTRIBUTION ===")
label_counts = df['Label'].value_counts()
print(label_counts)

# Calculate class balance as each label's share of all rows
total_samples = len(df)
print(f"\nClass Balance:")
for label, count in label_counts.items():
    percentage = (count / total_samples) * 100
    print(f"  {label}: {count:,} samples ({percentage:.2f}%)")

In [None]:
# Two side-by-side views of the label distribution: absolute counts and shares
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: bar chart of label counts
label_counts.plot(kind='bar', ax=bar_ax, alpha=0.8)
bar_ax.set_title('Distribution of Traffic Types', fontweight='bold', fontsize=14)
bar_ax.set_xlabel('Traffic Type')
bar_ax.set_ylabel('Number of Samples')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: pie chart of the same counts as proportions
pie_ax.pie(
    label_counts.values,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Proportion of Traffic Types', fontweight='bold', fontsize=14)

plt.tight_layout()
plt.show()

# Per-column missing-value report (count and percentage of rows)
print("\n=== MISSING VALUES ===")
missing_values = df.isnull().sum()
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': (missing_values / len(df)) * 100,
})

# Keep only the columns that actually have gaps, worst first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df[has_missing].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("‚úÖ No missing values found.")
else:
    print(missing_df.head(10))

## 4. Data Preprocessing and Cleaning

In [None]:
print("=== DATA CLEANING PIPELINE ===")

# Create a copy for preprocessing so the raw `df` stays untouched
df_clean = df.copy()
print(f"Starting shape: {df_clean.shape}")

# Step 1: Clean column names (remove leading/trailing spaces)
df_clean.columns = df_clean.columns.str.strip()
print("‚úÖ Cleaned column names")

# Step 2: Remove non-predictive columns
# These columns are either identifiers or have issues in some datasets
columns_to_drop = ['Unnamed: 0', 'Flow ID', 'Source IP', 'Destination IP', 'Timestamp']

# Also check for duplicate header column (common in this dataset)
if 'Fwd Header Length.1' in df_clean.columns:
    columns_to_drop.append('Fwd Header Length.1')

# Drop only the columns that are actually present
existing_drops = [col for col in columns_to_drop if col in df_clean.columns]
df_clean = df_clean.drop(columns=existing_drops)
print(f"‚úÖ Dropped {len(existing_drops)} identifier columns: {existing_drops}")

# Step 3: Handle infinite values and NaNs
# These can occur from division-by-zero in feature calculations
print("\nHandling infinite and NaN values...")
print(f"Infinite values found: {np.isinf(df_clean.select_dtypes(include=[np.number])).sum().sum()}")

# Replace infinite values with NaN, then drop every row containing a NaN
df_clean.replace([np.inf, -np.inf], np.nan, inplace=True)
rows_before = len(df_clean)
df_clean.dropna(inplace=True)
rows_after = len(df_clean)

print(f"‚úÖ Removed {rows_before - rows_after} rows with NaN/infinite values")
print(f"Final shape after cleaning: {df_clean.shape}")

# Step 4: Encode labels
print("\nEncoding labels...")
print(f"Original labels: {df_clean['Label'].unique()}")

# Convert to binary classification: Benign = 0, Any attack = 1.
# The benign label is matched case-insensitively and with surrounding
# whitespace ignored. NOTE(review): exports of this dataset are presumed to
# differ in spelling ('Benign' vs 'BENIGN') - confirm against the printed
# original labels. An exact 'Benign' comparison would silently mark every
# flow as an attack whenever the spelling differs.
is_benign = df_clean['Label'].astype(str).str.strip().str.upper() == 'BENIGN'
df_clean['Label'] = (~is_benign).astype(int)
print(f"Encoded labels: {df_clean['Label'].unique()}")

# The stratified split and ROC analysis later in the notebook need both
# classes; flag the problem here, where the cause is easiest to see.
if df_clean['Label'].nunique() < 2:
    print("WARNING: only one class present after encoding - check the label values above.")

print("\n‚úÖ Data preprocessing complete!")
print(f"Final dataset shape: {df_clean.shape}")
print("\nFinal label distribution:")
print(df_clean['Label'].value_counts())

## 5. Feature Analysis and Selection

In [None]:
# Analyze key features that distinguish between benign and malicious traffic
print("=== FEATURE ANALYSIS ===")

# Every column except the target is a candidate feature
feature_columns = df_clean.columns.drop('Label')
print(f"Total features available: {len(feature_columns)}")

# Sample some key features for analysis
key_features = [
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Bwd Packet Length Mean', 'Flow Duration', 'Avg Fwd Segment Size',
    'Fwd Packets/s', 'Bwd Packets/s', 'Idle Mean'
]

# Select features that exist in our dataset
available_key_features = [f for f in key_features if f in feature_columns]
print(f"Key features for analysis: {len(available_key_features)}")

if len(available_key_features) >= 4:
    # Create comparison plots (2x2 grid, one box plot per feature)
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.ravel()

    # Draw one seeded sample up front so all four panels describe the same
    # rows and the figure is reproducible between runs.
    df_sample = df_clean.sample(min(10000, len(df_clean)), random_state=42)

    for ax, feature in zip(axes, available_key_features[:4]):
        # Box plot comparing benign vs malicious traffic
        sns.boxplot(data=df_sample, x='Label', y=feature, ax=ax)
        ax.set_title(f'Distribution of {feature}', fontweight='bold')
        ax.set_xlabel('Traffic Type (0=Benign, 1=Attack)')

    plt.tight_layout()
    plt.show()

    # Statistical comparison: per-class means computed in a single pass.
    # reindex([0, 1]) keeps a (NaN) row for a class that has no samples.
    print("\nStatistical Comparison (Benign vs Attack):")
    stat_features = available_key_features[:5]
    class_means = df_clean.groupby('Label')[stat_features].mean().reindex([0, 1])
    for feature in stat_features:
        benign_mean = class_means.loc[0, feature]
        attack_mean = class_means.loc[1, feature]
        # Format the ratio separately: applying ':.2f' to the string 'inf'
        # (the zero-denominator fallback) raises ValueError.
        ratio_text = f"{attack_mean / benign_mean:.2f}" if benign_mean != 0 else "inf"
        print(f"{feature}:")
        print(f"  Benign avg: {benign_mean:.4f}")
        print(f"  Attack avg: {attack_mean:.4f}")
        print(f"  Ratio: {ratio_text}")
        print()

# Check for any remaining data quality issues
print("\n=== FINAL DATA QUALITY CHECK ===")
numeric_features = df_clean.select_dtypes(include=[np.number]).columns.drop('Label')
print(f"Numeric features: {len(numeric_features)}")
print(f"Any infinite values: {np.isinf(df_clean[numeric_features]).sum().sum()}")
print(f"Any NaN values: {df_clean[numeric_features].isnull().sum().sum()}")

## 6. Train-Test Split and Model Preparation

In [None]:
print("=== PREPARING DATA FOR TRAINING ===")


def _report_class_distribution(distribution, n_samples):
    """Print one line per class: its sample count and its share of n_samples."""
    for label, count in distribution.items():
        percentage = (count / n_samples) * 100
        label_name = 'Benign' if label == 0 else 'Attack'
        print(f"  {label_name} ({label}): {count:,} samples ({percentage:.2f}%)")


# Target vector (y) is the encoded label; everything else is the feature matrix (X)
y = df_clean['Label']
X = df_clean.drop(columns=['Label'])

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Features: {X.columns.tolist()[:10]}...")  # Show first 10 features

# 70/30 split with a fixed seed; stratifying on y keeps the class proportions
# the same in the training and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"\n=== TRAIN-TEST SPLIT RESULTS ===")
print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")
print(f"Features: {X_train.shape[1]}")

# Show the class mix of each partition to confirm the stratification
print(f"\nTraining set class distribution:")
train_dist = y_train.value_counts()
_report_class_distribution(train_dist, len(y_train))

print(f"\nTest set class distribution:")
test_dist = y_test.value_counts()
_report_class_distribution(test_dist, len(y_test))

# Size of each feature partition in memory
print(f"\nMemory usage:")
print(f"  Training features: {X_train.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  Test features: {X_test.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

## 7. Model Training with Random Forest

In [None]:
import time

print("=== RANDOM FOREST TRAINING ===")

# Hyperparameters for the forest, collected in one place:
#   n_estimators=100     - balance between performance and training time
#   max_depth=None       - trees may grow deep to capture complex patterns
#   min_samples_split=5  - limits overfitting
#   min_samples_leaf=2   - keeps a minimum number of samples in each leaf
#   n_jobs=-1            - use all available CPU cores
#   verbose=1            - show progress while fitting
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
model = RandomForestClassifier(**rf_params)

print("Model Configuration:")
print(f"  Algorithm: Random Forest")
print(f"  Number of trees: {model.n_estimators}")
print(f"  Max depth: {model.max_depth}")
print(f"  Min samples split: {model.min_samples_split}")
print(f"  Min samples leaf: {model.min_samples_leaf}")

print(f"\nTraining on {X_train.shape[0]:,} samples with {X_train.shape[1]} features...")
print("This may take a few minutes depending on your hardware...")

# Fit the forest and measure the wall-clock time the fit takes
fit_started = time.time()
model.fit(X_train, y_train)
training_time = time.time() - fit_started

print(f"\n‚úÖ Training completed in {training_time:.2f} seconds ({training_time/60:.2f} minutes)")

# Summary of the fitted estimator
print(f"\n=== TRAINED MODEL INFO ===")
print(f"Number of features used: {model.n_features_in_}")
print(f"Number of classes: {len(model.classes_)}")
print(f"Classes: {model.classes_} (0=Benign, 1=Attack)")
print(f"Number of trees: {len(model.estimators_)}")

## 8. Model Evaluation and Performance Analysis

In [None]:
print("=== MODEL EVALUATION ===")

# Score the held-out test set: hard class predictions plus attack probabilities
print("Generating predictions...")
y_pred = model.predict(X_test)
attack_column = 1  # second predict_proba column = probability of the attack class
y_pred_proba = model.predict_proba(X_test)[:, attack_column]

# Headline accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nüìä Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Per-class report: printed as text, and kept as a dict for metric extraction
print("\n=== DETAILED CLASSIFICATION REPORT ===")
class_names = ['Benign (0)', 'Attack (1)']
print(classification_report(y_test, y_pred, target_names=class_names))
report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)

# Pull precision/recall for both classes out of the report dict
benign_stats = report['Benign (0)']
attack_stats = report['Attack (1)']
benign_precision, benign_recall = benign_stats['precision'], benign_stats['recall']
attack_precision, attack_recall = attack_stats['precision'], attack_stats['recall']

print(f"\n=== SECURITY METRICS INTERPRETATION ===")
print(f"Attack Detection Precision: {attack_precision:.4f}")
print(f"  ‚Üí When model predicts attack, it's correct {attack_precision*100:.1f}% of the time")
print(f"Attack Detection Recall: {attack_recall:.4f}")
print(f"  ‚Üí Model catches {attack_recall*100:.1f}% of actual attacks")
print(f"False Positive Rate: {1-benign_recall:.4f}")
print(f"  ‚Üí {(1-benign_recall)*100:.1f}% of benign traffic flagged as attacks")
print(f"False Negative Rate: {1-attack_recall:.4f}")
print(f"  ‚Üí {(1-attack_recall)*100:.1f}% of attacks go undetected")

# AUC-ROC from the attack probabilities, mapped to a qualitative rating.
# Bands are checked from the highest cut-off down; the first one exceeded wins.
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC-ROC Score: {auc_score:.4f}")
for cutoff, rating in ((0.95, "Excellent"), (0.90, "Very Good"), (0.85, "Good")):
    if auc_score > cutoff:
        performance_level = rating
        break
else:
    performance_level = "Needs Improvement"
print(f"Performance Level: {performance_level}")

In [None]:
# Confusion matrix: raw counts, annotated heatmap, and an operational reading
print("\n=== CONFUSION MATRIX ANALYSIS ===")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Rows are actual classes, columns are predicted classes (0=Benign, 1=Attack)
(tn, fp), (fn, tp) = cm
print(f"\nConfusion Matrix Breakdown:")
print(f"  True Negatives (TN): {tn:,} - Correctly identified benign traffic")
print(f"  False Positives (FP): {fp:,} - Benign traffic flagged as attacks")
print(f"  False Negatives (FN): {fn:,} - Missed attacks")
print(f"  True Positives (TP): {tp:,} - Correctly detected attacks")

# Heatmap of the counts
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Benign', 'Predicted Attack'],
    yticklabels=['Actual Benign', 'Actual Attack'],
    cbar_kws={'label': 'Number of Samples'},
)

plt.title('DDoS Attack Detection - Confusion Matrix', fontsize=16, fontweight='bold', pad=20)
plt.ylabel('Actual Class', fontsize=12)
plt.xlabel('Predicted Class', fontsize=12)

# Under each count, add that cell's share of all test samples
total = cm.sum()
for (i, j), cell_count in np.ndenumerate(cm):
    percentage = cell_count / total * 100
    plt.text(j + 0.5, i + 0.7, f'({percentage:.1f}%)',
             horizontalalignment='center', fontsize=10, color='gray')

plt.tight_layout()
plt.show()

# Express the four outcomes as shares of all analysed flows
print(f"\n=== BUSINESS IMPACT ANALYSIS ===")
total_traffic = len(y_test)
print(f"Total network traffic analyzed: {total_traffic:,} flows")
print(f"")
print(f"Security Effectiveness:")
print(f"  ‚úÖ Successfully blocked: {tp:,} attack flows ({tp/total_traffic*100:.2f}% of total)")
print(f"  ‚ùå Missed attacks: {fn:,} flows ({fn/total_traffic*100:.3f}% of total)")
print(f"  ‚ö†Ô∏è  False alarms: {fp:,} flows ({fp/total_traffic*100:.3f}% of total)")
print(f"  ‚úÖ Correctly allowed: {tn:,} benign flows ({tn/total_traffic*100:.2f}% of total)")

## 9. ROC Curve and Performance Visualization

In [None]:
# ROC Curve and Precision-Recall Curve Analysis
from sklearn.metrics import average_precision_score

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# ROC Curve (built from the attack-class probabilities)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
axes[0].plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {auc_score:.3f})')
axes[0].plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
axes[0].set_xlim([0.0, 1.0])
axes[0].set_ylim([0.0, 1.05])
axes[0].set_xlabel('False Positive Rate (1 - Specificity)')
axes[0].set_ylabel('True Positive Rate (Sensitivity)')
axes[0].set_title('ROC Curve - DDoS Attack Detection', fontweight='bold')
axes[0].legend(loc="lower right")
axes[0].grid(True, alpha=0.3)

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
# Average precision is the recall-weighted mean of precision along the curve.
# A plain np.mean(precision) weights every threshold point equally, which is
# not AP, so the dedicated scorer is used.
avg_precision = average_precision_score(y_test, y_pred_proba)
axes[1].plot(recall, precision, linewidth=2, label=f'PR Curve (AP = {avg_precision:.3f})')
# Baseline: the positive-class prevalence in the test set
axes[1].axhline(y=np.mean(y_test), color='k', linestyle='--', linewidth=1,
                label=f'Random Classifier (AP = {np.mean(y_test):.3f})')
axes[1].set_xlim([0.0, 1.0])
axes[1].set_ylim([0.0, 1.05])
axes[1].set_xlabel('Recall (True Positive Rate)')
axes[1].set_ylabel('Precision')
axes[1].set_title('Precision-Recall Curve - DDoS Attack Detection', fontweight='bold')
axes[1].legend(loc="lower left")
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n=== CURVE ANALYSIS ===")
print(f"ROC-AUC: {auc_score:.4f}")
print(f"  ‚Üí Measures overall discriminative ability")
print(f"  ‚Üí Values closer to 1.0 indicate better performance")
print(f"Average Precision: {avg_precision:.4f}")
print(f"  ‚Üí Particularly important for imbalanced datasets")
print(f"  ‚Üí Focuses on precision at different recall levels")

# Threshold analysis for operational deployment: re-threshold the attack
# probabilities and report the resulting detection / false-alarm trade-off
print(f"\n=== THRESHOLD ANALYSIS FOR DEPLOYMENT ===")
thresholds = [0.3, 0.5, 0.7, 0.9]
for threshold in thresholds:
    y_pred_thresh = (y_pred_proba >= threshold).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below
    # cannot fail regardless of which classes appear at this threshold.
    cm_thresh = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    tn_t, fp_t, fn_t, tp_t = cm_thresh.ravel()

    # Guard each ratio against an empty denominator
    precision_t = tp_t / (tp_t + fp_t) if (tp_t + fp_t) > 0 else 0
    recall_t = tp_t / (tp_t + fn_t) if (tp_t + fn_t) > 0 else 0
    fpr_t = fp_t / (fp_t + tn_t) if (fp_t + tn_t) > 0 else 0

    print(f"\nThreshold {threshold}:")
    print(f"  Attack Detection Rate: {recall_t:.3f} ({recall_t*100:.1f}%)")
    print(f"  False Positive Rate: {fpr_t:.4f} ({fpr_t*100:.2f}%)")
    print(f"  Precision: {precision_t:.3f} ({precision_t*100:.1f}%)")

    if threshold == 0.5:
        print(f"  ‚Üê Default threshold used in evaluation")

## 10. Feature Importance Analysis

In [None]:
print("=== FEATURE IMPORTANCE ANALYSIS ===")

# Pair every feature name with the importance reported by the trained forest,
# ordered from most to least important
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTop 15 Most Important Features:")
print(feature_importance.head(15).to_string(index=False))

# Horizontal bar chart of the top 15 features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
bars = plt.barh(bar_positions, top_features['importance'], alpha=0.8)
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Feature Importance Score', fontsize=12)
plt.title('Top 15 Features for DDoS Attack Detection', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # Highest importance at top

# Write each bar's value just past its right-hand end
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.001, bar.get_y() + bar.get_height()/2,
             f'{width:.3f}', ha='left', va='center', fontsize=10)

plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Categorize features for network engineering insights
print(f"\n=== NETWORK ENGINEERING INSIGHTS ===")


def _top_features_matching(*keywords):
    """Return the top-15 feature names whose lower-cased name contains any keyword."""
    return [name for name in top_features['feature']
            if any(keyword in name.lower() for keyword in keywords)]


def _print_feature_group(names):
    """Print up to five features from `names` together with their importance."""
    for feature in names[:5]:
        is_feature = feature_importance['feature'] == feature
        importance = feature_importance.loc[is_feature, 'importance'].iloc[0]
        print(f"  ‚Ä¢ {feature}: {importance:.4f}")


# Group the top features by substring category (a name can fall into several)
timing_features = _top_features_matching('time', 'duration', 'iat', 'idle')
packet_features = _top_features_matching('packet', 'length', 'size')
flow_features = _top_features_matching('flow', 'rate', '/s')
flag_features = _top_features_matching('flag', 'tcp')

print(f"Key Timing-based Features ({len(timing_features)}):")
_print_feature_group(timing_features)

print(f"\nKey Packet-based Features ({len(packet_features)}):")
_print_feature_group(packet_features)

print(f"\nKey Flow-based Features ({len(flow_features)}):")
_print_feature_group(flow_features)

# Summary insights for network operations
print(f"\n=== OPERATIONAL INSIGHTS ===")
print(f"1. TIMING PATTERNS: DDoS attacks show distinct timing signatures")
print(f"   - Monitor inter-arrival times and flow durations")
print(f"   - Abnormal timing patterns indicate potential attacks")

print(f"\n2. PACKET CHARACTERISTICS: Attack packets have different size profiles")
print(f"   - Mean packet lengths vary significantly between benign/malicious")
print(f"   - Packet size distributions are key indicators")

print(f"\n3. VOLUME METRICS: Rate-based features are crucial")
print(f"   - Packets per second and bytes per second patterns")
print(f"   - Sudden volume spikes indicate potential DDoS")

# Write the full ranked table to CSV for operational use
feature_importance.to_csv('ddos_feature_importance.csv', index=False)
print(f"\n‚úÖ Feature importance exported to 'ddos_feature_importance.csv'")

## 11. Model Validation and Cross-Validation

In [None]:
print("=== MODEL VALIDATION ===")

from sklearn.model_selection import cross_validate

# Perform cross-validation to ensure model robustness
print("Performing 5-fold cross-validation...")
print("This provides a more robust estimate of model performance.")

# Use a smaller subset for cross-validation to manage computational time
# In production, you would use the full dataset
# NOTE: the subset is drawn from the full X/y, so it can include rows that
# were also part of the earlier train/test split.
cv_sample_size = min(20000, len(X))
X_cv = X.sample(cv_sample_size, random_state=42)
y_cv = y.loc[X_cv.index]

print(f"Cross-validation on {cv_sample_size:,} samples...")

# Score all four metrics in a single cross-validation run. Calling
# cross_val_score once per metric refits the forest on every fold each time
# (20 fits); cross_validate evaluates every metric on the same 5 fitted folds.
cv_results = cross_validate(
    model, X_cv, y_cv,
    cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    n_jobs=-1,
)
cv_scores = cv_results['test_accuracy']
cv_precision = cv_results['test_precision']
cv_recall = cv_results['test_recall']
cv_f1 = cv_results['test_f1']

print(f"\n=== CROSS-VALIDATION RESULTS ===")
print(f"Accuracy: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
print(f"  Individual folds: {cv_scores}")
print(f"Precision: {cv_precision.mean():.4f} ¬± {cv_precision.std():.4f}")
print(f"Recall: {cv_recall.mean():.4f} ¬± {cv_recall.std():.4f}")
print(f"F1-Score: {cv_f1.mean():.4f} ¬± {cv_f1.std():.4f}")

# Stability analysis: rate the spread (std) of fold accuracies
accuracy_std = cv_scores.std()
if accuracy_std < 0.01:
    stability = "Very Stable"
elif accuracy_std < 0.02:
    stability = "Stable"
else:
    stability = "Moderately Stable"

print(f"\nModel Stability: {stability}")
print(f"(Standard deviation of CV scores: {accuracy_std:.4f})")

# Performance consistency check: rate the best-to-worst fold gap
accuracy_range = cv_scores.max() - cv_scores.min()
print(f"\n=== PERFORMANCE CONSISTENCY ===")
print(f"Minimum accuracy across folds: {cv_scores.min():.4f}")
print(f"Maximum accuracy across folds: {cv_scores.max():.4f}")
print(f"Range: {accuracy_range:.4f}")

if accuracy_range < 0.02:
    consistency = "Highly Consistent"
elif accuracy_range < 0.05:
    consistency = "Consistent"
else:
    consistency = "Moderately Consistent"

print(f"Performance Consistency: {consistency}")

## 12. Deployment Readiness Assessment

In [None]:
print("=== DEPLOYMENT READINESS ASSESSMENT ===")

# Assess model prediction speed for real-time deployment
import os
import time

# Test prediction speed on various batch sizes
batch_sizes = [1, 10, 100, 1000]
prediction_times = {}

for batch_size in batch_sizes:
    # Skip batch sizes larger than the available test set
    if batch_size > len(X_test):
        continue

    sample_data = X_test.iloc[:batch_size]

    # Time the prediction with perf_counter(), the clock intended for
    # measuring short durations; time.time() can report 0 elapsed for a
    # fast call on coarse system clocks.
    start_time = time.perf_counter()
    model.predict(sample_data)
    elapsed_seconds = time.perf_counter() - start_time

    prediction_time = elapsed_seconds * 1000  # Convert to milliseconds
    per_sample_time = prediction_time / batch_size

    prediction_times[batch_size] = {
        'total_ms': prediction_time,
        'per_sample_ms': per_sample_time,
        # Guard the division in case the measured time is still zero
        'samples_per_second': 1000 / per_sample_time if per_sample_time > 0 else float('inf'),
    }

print(f"\n=== PREDICTION SPEED ANALYSIS ===")
for batch_size, times in prediction_times.items():
    print(f"Batch size {batch_size}:")
    print(f"  Total time: {times['total_ms']:.2f} ms")
    print(f"  Per sample: {times['per_sample_ms']:.3f} ms")
    print(f"  Throughput: {times['samples_per_second']:.0f} samples/second")
    print()

# Real-time deployment assessment, based on the single-flow latency
single_prediction_time = prediction_times[1]['per_sample_ms']
if single_prediction_time < 1:
    deployment_readiness = "Excellent for real-time deployment"
elif single_prediction_time < 10:
    deployment_readiness = "Good for near real-time deployment"
elif single_prediction_time < 100:
    deployment_readiness = "Suitable for batch processing"
else:
    deployment_readiness = "May need optimization for production"

print(f"\n=== DEPLOYMENT RECOMMENDATION ===")
print(f"Single prediction time: {single_prediction_time:.3f} ms")
print(f"Assessment: {deployment_readiness}")

# Save model for deployment
import joblib
joblib.dump(model, 'ddos_detection_model.pkl')
print(f"\n‚úÖ Model saved as 'ddos_detection_model.pkl' for deployment")

# Model size: report the size of the serialized artifact. sys.getsizeof(model)
# is shallow - it measures only the estimator object itself, not the trees it
# references - so it badly under-reports the real size.
import sys
model_size = os.path.getsize('ddos_detection_model.pkl') / 1024**2  # Convert to MB
print(f"\nModel size (serialized): {model_size:.2f} MB")

# Feature requirements for deployment
print(f"\n=== DEPLOYMENT REQUIREMENTS ===")
print(f"‚úÖ Required features: {len(X.columns)}")
print(f"‚úÖ Preprocessing steps: Handle inf/NaN, drop identifier columns")
print(f"‚úÖ Input format: Pandas DataFrame or NumPy array")
print(f"‚úÖ Output: Binary classification (0=Benign, 1=Attack)")
print(f"‚úÖ Probability scores: Available via predict_proba()")

# Create deployment metadata (plain Python types so it serializes to JSON)
deployment_metadata = {
    'model_type': 'RandomForestClassifier',
    'model_version': '1.0',
    'training_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'training_samples': len(X_train),
    'test_accuracy': float(accuracy),
    'test_precision': float(attack_precision),
    'test_recall': float(attack_recall),
    'auc_score': float(auc_score),
    'feature_count': len(X.columns),
    'prediction_time_ms': float(single_prediction_time),
    'required_features': X.columns.tolist()
}

import json
with open('model_metadata.json', 'w') as f:
    json.dump(deployment_metadata, f, indent=2)

print(f"‚úÖ Deployment metadata saved as 'model_metadata.json'")

## 13. Security and Business Impact Summary

In [None]:
# Final report: restates the metrics computed in the earlier cells
banner = "=" * 70
false_positive_rate = 1 - benign_recall  # share of benign flows flagged as attacks

print("\n" + banner)
print("        DDOS ATTACK DETECTION - FINAL SUMMARY")
print(banner)

print("\nüõ°Ô∏è SECURITY PERFORMANCE:")
print(f"   ‚Ä¢ Overall Accuracy: {accuracy*100:.2f}%")
print(f"   ‚Ä¢ Attack Detection Rate: {attack_recall*100:.2f}% (Recall)")
print(f"   ‚Ä¢ Attack Precision: {attack_precision*100:.2f}%")
print(f"   ‚Ä¢ False Positive Rate: {(1-benign_recall)*100:.3f}%")
print(f"   ‚Ä¢ AUC-ROC Score: {auc_score:.4f} - {performance_level}")

# Operational counts derived from the confusion-matrix cells
print("\nüìä OPERATIONAL IMPACT:")
attacks_detected, attacks_missed, false_alarms = tp, fn, fp
total_attacks = tp + fn
total_benign = tn + fp

print(f"   ‚Ä¢ Attacks Successfully Blocked: {attacks_detected:,} / {total_attacks:,}")
print(f"   ‚Ä¢ Critical Attacks Missed: {attacks_missed:,}")
print(f"   ‚Ä¢ False Alarms Generated: {false_alarms:,} / {total_benign:,}")
print(f"   ‚Ä¢ Prediction Speed: {single_prediction_time:.3f} ms per flow")
print(f"   ‚Ä¢ Throughput: {prediction_times[1]['samples_per_second']:.0f} flows/second")

# The three highest-ranked features from the importance table
print("\nüéØ KEY ATTACK INDICATORS:")
top_3_features = feature_importance.head(3)
for _, row in top_3_features.iterrows():
    print(f"   ‚Ä¢ {row['feature']}: {row['importance']:.4f} importance")

# Rate the protection level from recall and false-positive rate together
print("\nüíº BUSINESS VALUE:")
if attack_recall > 0.95 and false_positive_rate < 0.05:
    business_value = "HIGH - Excellent protection with minimal disruption"
elif attack_recall > 0.90 and false_positive_rate < 0.10:
    business_value = "GOOD - Strong protection with acceptable false positives"
else:
    business_value = "MODERATE - May need threshold tuning for optimal balance"

print(f"   ‚Ä¢ Protection Level: {business_value}")
print(f"   ‚Ä¢ Automated Defense: ‚úÖ Ready for real-time deployment")
print(f"   ‚Ä¢ Cost Reduction: ‚úÖ Prevents service outages and emergency response")
print(f"   ‚Ä¢ Compliance: ‚úÖ Meets security monitoring requirements")

# Threshold hints are emitted only when the matching metric misses its target
print("\nüîß DEPLOYMENT RECOMMENDATIONS:")
print(f"   ‚úì Deploy with default threshold (0.5) for balanced performance")
if false_positive_rate > 0.02:
    print(f"   ‚ö†Ô∏è Consider higher threshold (0.7) to reduce false positives")
if attack_recall < 0.95:
    print(f"   ‚ö†Ô∏è Consider lower threshold (0.3) for maximum attack detection")
print(f"   ‚úì Implement automated blocking for high-confidence predictions (>0.8)")
print(f"   ‚úì Set up alerts for predictions between 0.5-0.8 for human review")
print(f"   ‚úì Regular model retraining recommended (monthly with new attack data)")

# Files written by the earlier cells
print("\nüìÅ GENERATED ARTIFACTS:")
for artifact_line in (
    "   ‚Ä¢ ddos_detection_model.pkl - Trained model for deployment",
    "   ‚Ä¢ model_metadata.json - Deployment configuration",
    "   ‚Ä¢ ddos_feature_importance.csv - Feature analysis for SOC teams",
):
    print(artifact_line)

print("\n" + banner)
print("    MODEL READY FOR PRODUCTION DEPLOYMENT")
print(f"    Trained on {len(X_train):,} flows | Validated on {len(X_test):,} flows")
print(f"    Attack Detection: {attack_recall*100:.1f}% | False Alarms: {(1-benign_recall)*100:.2f}%")
print(banner)

print("\nüöÄ NEXT STEPS:")
for step_line in (
    "   1. Integrate model into network security infrastructure",
    "   2. Set up monitoring dashboard for prediction statistics",
    "   3. Implement automated response for high-confidence attacks",
    "   4. Plan regular model updates with new threat intelligence",
    "   5. Conduct A/B testing in production environment",
):
    print(step_line)

print("\n‚úÖ DDoS Attack Detection Model Training Complete!")