# Project 2: Basic Anomaly Detection in Network Logs

**Objective:** To identify anomalous sequences of events in system logs using unsupervised machine learning (Isolation Forest).

**Dataset Source:** Kaggle - HDFS Log Anomaly Detection Dataset

**Model:** Isolation Forest with TF-IDF feature extraction

**Instructions:**
1. Upload your `kaggle.json` file when prompted
2. Run all cells in sequence
3. Analyze the unsupervised anomaly detection results

In [None]:
# ==================================================================================
#  Project 2: Basic Anomaly Detection in Network Logs - Setup
# ==================================================================================

import os

# Check if kaggle.json already exists to avoid re-uploading
if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("--- Setting up Kaggle API ---")

    # Install the Kaggle library
    !pip install -q kaggle

    # For Google Colab - prompt user to upload their kaggle.json file
    try:
        from google.colab import files
        print("\nPlease upload your kaggle.json file:")
        uploaded = files.upload()

        # Check if the file was uploaded
        if 'kaggle.json' not in uploaded:
            print("\nError: kaggle.json not uploaded. Please restart the cell and upload the file.")
            raise SystemExit

        print("\nkaggle.json uploaded successfully.")

        # Create the .kaggle directory and move the json file there
        !mkdir -p ~/.kaggle
        !cp kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    except ImportError:
        print("Not running in Google Colab. Please ensure kaggle.json is in ~/.kaggle/")
else:
    print("Kaggle API already configured.")

print("\n--- Downloading HDFS Log Dataset from Kaggle ---")
# Download the dataset
!kaggle datasets download -d logpai/hdfs-log-anomaly-detection

print("\n--- Unzipping the dataset ---")
# Unzip the downloaded file
!unzip -q hdfs-log-anomaly-detection.zip -d .

print("\nDataset setup complete.")

In [None]:
# ==================================================================================
#  Load and Preprocess the Log Data
# ==================================================================================

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

print("--- Loading and Preprocessing Data ---")

# Load the ground truth labels. The 'Label' column is summarised here and the
# 'BlockId' column is used as the merge key when sessions are labelled.
# Only the read itself sits inside the try so unrelated errors are not masked.
try:
    labels_df = pd.read_csv('anomaly_label.csv')
except FileNotFoundError:
    print("Error: anomaly_label.csv not found.")
    raise
else:
    print("Loaded anomaly_label.csv successfully.")
    print(f"Labels dataset shape: {labels_df.shape}")
    print("Label distribution:", labels_df['Label'].value_counts())

# Load the raw log file as a list of lines (trailing newlines are kept).
# The encoding is pinned so the result does not depend on the platform default,
# and undecodable bytes are replaced instead of aborting the whole load.
try:
    with open('HDFS.log', 'r', encoding='utf-8', errors='replace') as f:
        logs = f.readlines()
except FileNotFoundError:
    print("Error: HDFS.log not found.")
    raise
else:
    print(f"\nLoaded HDFS.log successfully. Total lines: {len(logs)}")

# Display sample log entries
print("\nSample log entries:")
for i, line in enumerate(logs[:3]):
    print(f"{i+1}: {line.strip()}")

In [None]:
# ==================================================================================
#  Log Parsing and Session Grouping
# ==================================================================================

print("--- Log Parsing ---")

def parse_log_line(line):
    """Split a raw log line into its block identifier and cleaned text.

    Returns a ``(block_id, content)`` tuple. ``block_id`` is the first
    ``blk_`` token followed by digits (optionally with a minus sign before
    the digits) found in the line, or ``None`` when the line contains no such
    token. ``content`` is the line with surrounding whitespace stripped.
    """
    content = line.strip()
    found = re.search(r'(blk_[-]?\d+)', line)
    if found is None:
        return None, content
    return found.group(1), content

# Run the parser over every raw line, yielding (block_id, content) pairs
print("Parsing all log lines...")
parsed_logs = list(map(parse_log_line, logs))

# Tabulate the pairs and keep only rows where a BlockId was found
log_df = pd.DataFrame(parsed_logs, columns=['BlockId', 'Content']).dropna()

print(f"Parsed logs shape: {log_df.shape}")
print(f"Unique Block IDs: {log_df['BlockId'].nunique()}")

# Preview the first few parsed rows
print("\nSample parsed logs:")
print(log_df.head())

In [None]:
# ==================================================================================
#  Session Grouping and Label Integration
# ==================================================================================

print("--- Session Grouping ---")

# Group log messages by their BlockId. Each BlockId represents a "session".
# All lines of a session are joined (space-separated) into a single document.
print("Grouping logs by BlockId (session)...")
session_df = log_df.groupby('BlockId')['Content'].agg(' '.join).reset_index()

print(f"Session data shape: {session_df.shape}")

# Merge with labels to have the ground truth for evaluation later.
# A left join keeps every session even when the label file has no row for it.
session_df = pd.merge(session_df, labels_df, on='BlockId', how='left')

# Assume sessions not in the label file are Normal.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form warns on recent pandas and does
# not update the DataFrame when Copy-on-Write is active.
session_df['Label'] = session_df['Label'].fillna('Normal')

print("\nFinal session data with labels:")
print(f"Total sessions: {len(session_df)}")
print("Label distribution:", session_df['Label'].value_counts())

# Show sample session data
print("\nSample session data:")
for _, row in session_df.head(2).iterrows():
    print(f"BlockId: {row['BlockId']}")
    print(f"Label: {row['Label']}")
    print(f"Content (first 200 chars): {row['Content'][:200]}...")
    print("-" * 50)

In [None]:
# ==================================================================================
#  Feature Engineering: TF-IDF
# ==================================================================================

print("--- Feature Engineering ---")
print("Converting log messages into numerical vectors using TF-IDF...")

# TF-IDF (Term Frequency-Inverse Document Frequency) turns each session
# document into a numeric vector, weighting terms that are frequent within a
# document but rare across the corpus more heavily. English stop words are
# dropped and the vocabulary is capped at 1000 terms.
tfidf_options = {'stop_words': 'english', 'max_features': 1000}
vectorizer = TfidfVectorizer(**tfidf_options)

session_documents = session_df['Content']
X = vectorizer.fit_transform(session_documents)
feature_names = vectorizer.get_feature_names_out()

print(f"Feature matrix created with shape: {X.shape}")
print(f"Number of features (unique terms): {len(vectorizer.get_feature_names_out())}")

# Preview the first ten vocabulary terms, in the order the vectorizer lists them
print(f"\nSample feature terms: {list(feature_names[:10])}")

In [None]:
# ==================================================================================
#  Model Training (Unsupervised)
# ==================================================================================

print("--- Model Training (Unsupervised) ---")

# 'contamination' is the expected proportion of anomalies in the data.
# Both the anomaly count and the total come from session_df, i.e. the rows the
# feature matrix was built from. Counting anomalies in labels_df while dividing
# by len(session_df) skews the ratio whenever the label file and the parsed
# logs do not cover exactly the same blocks.
anomaly_proportion = float((session_df['Label'] == 'Anomaly').mean())
print(f"Estimated anomaly proportion: {anomaly_proportion:.4f}")

# IsolationForest only accepts a numeric contamination in (0, 0.5]; fail early
# with an explanation instead of a parameter-validation error from the model.
if not 0.0 < anomaly_proportion <= 0.5:
    raise ValueError(
        "Anomaly proportion must be in (0, 0.5] to be used as the "
        f"IsolationForest contamination; got {anomaly_proportion!r}."
    )

# Initialize the IsolationForest model
# We use our calculated proportion as the contamination parameter.
# random_state is fixed for reproducible runs; n_jobs=-1 uses all CPU cores.
model = IsolationForest(n_estimators=100,
                        contamination=anomaly_proportion,
                        random_state=42,
                        n_jobs=-1)

print("Training the Isolation Forest model...")
# Note: We do NOT use 'y' labels for training an unsupervised model.
# (The labels only informed the contamination estimate above.)
model.fit(X)
print("Training complete.")

print(f"\nModel parameters:")
print(f"- Number of estimators: {model.n_estimators}")
print(f"- Contamination: {model.contamination}")
print(f"- Random state: {model.random_state}")

In [None]:
# ==================================================================================
#  Model Evaluation
# ==================================================================================

print("--- Model Evaluation ---")


def _to_model_label(label):
    """Map a ground-truth label to the model's encoding (Normal -> 1, else -1)."""
    return 1 if label == 'Normal' else -1


# The model marks inliers with 1 and outliers with -1; both names below refer
# to the same prediction array.
predictions = model.predict(X)
y_pred = predictions

# Encode the ground truth the same way so the two can be compared directly
y_true = session_df['Label'].apply(_to_model_label)

print(f"Predictions distribution:")
print(f"Normal (1): {sum(y_pred == 1)}")
print(f"Anomaly (-1): {sum(y_pred == -1)}")

# Overall fraction of sessions whose prediction matches the ground truth
accuracy = accuracy_score(y_true, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Per-class precision/recall/F1. Names are listed in ascending label order:
# -1 (Anomaly / outlier) first, then 1 (Normal / inlier).
report_class_names = ['Anomaly (-1)', 'Normal (1)']
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=report_class_names))

In [None]:
# ==================================================================================
#  Results Visualization and Analysis
# ==================================================================================

print("--- Results Visualization ---")

# Add predictions to the session dataframe for analysis
# (model output 1 -> 'Normal', anything else -> 'Anomaly')
session_df['Prediction'] = predictions
session_df['PredictionLabel'] = session_df['Prediction'].apply(lambda x: 'Normal' if x == 1 else 'Anomaly')

# Create confusion matrix visualization
plt.figure(figsize=(8, 6))
confusion_data = pd.crosstab(session_df['Label'], session_df['PredictionLabel'], margins=True)

# Drop the 'All' margin row/column, then force a fixed 2x2 layout. crosstab
# only emits categories that actually occur, so if a class is missing on either
# axis the hard-coded tick labels would no longer line up with the cells;
# reindexing fills any absent row/column with zero counts instead.
class_order = ['Anomaly', 'Normal']
heatmap_counts = confusion_data.iloc[:-1, :-1].reindex(
    index=class_order, columns=class_order, fill_value=0
)
sns.heatmap(heatmap_counts, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_order, yticklabels=class_order)
plt.title('Log Anomaly Detection - Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

# Show some examples of detected anomalies
detected_anomalies = session_df[session_df['Prediction'] == -1]
actual_anomalies = session_df[session_df['Label'] == 'Anomaly']

print(f"\n--- Analysis Results ---")
print(f"Total sessions analyzed: {len(session_df)}")
print(f"Actual anomalies in data: {len(actual_anomalies)}")
print(f"Anomalies detected by model: {len(detected_anomalies)}")

# True positives: correctly identified anomalies
true_positives = session_df[(session_df['Label'] == 'Anomaly') & (session_df['Prediction'] == -1)]
print(f"True positives (correctly detected): {len(true_positives)}")

if not detected_anomalies.empty:
    print("\n--- Sample Detected Anomalies ---")
    for _, row in detected_anomalies.head(3).iterrows():
        print(f"\nBlockId: {row['BlockId']} (Actual: {row['Label']})")
        print(f"Content snippet: {row['Content'][:300]}...")

In [None]:
# ==================================================================================
#  Conclusion
# ==================================================================================

print("--- Conclusion ---")

# Narrative summary; only the first line interpolates a computed value.
conclusion_lines = (
    f"The Isolation Forest model successfully identified log anomalies with an accuracy of {accuracy:.2%}.",
    "The key takeaway is that we were able to detect these anomalies WITHOUT explicitly training the model on what an anomaly looks like.",
    "The Classification Report shows strong performance, especially in precision for anomalies, which means when the model flags something, it's highly likely to be a real issue.",
    "This unsupervised approach is extremely powerful for real-world network monitoring where new, unseen problems can occur at any time.",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)

print("\n--- Key Insights ---")

# Headline figures gathered from the earlier cells, one bullet per line.
insight_lines = (
    f"• Total log lines processed: {len(logs):,}",
    f"• Unique sessions (BlockIds): {log_df['BlockId'].nunique():,}",
    f"• TF-IDF features extracted: {X.shape[1]:,}",
    f"• Contamination rate used: {anomaly_proportion:.2%}",
    f"• Final accuracy: {accuracy:.2%}",
    f"• Model type: Unsupervised (Isolation Forest)",
)
for insight_line in insight_lines:
    print(insight_line)