ML-Labs/lab7.py at main · Cairo09/ML-Labs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

#A2
# Load the dataset
df = pd.read_csv('iotsim-air-quality-1.csv')

# Drop columns that are non-numeric or have unique identifiers not useful for general modeling
df_processed = df.drop(columns=['frame.time', 'eth.src', 'eth.dst', 'ip.dst', 'ip.src'])

# --- Convert categorical columns ---
# Each 'frame.protocols' value is split on ':' and expanded into one 0/1 indicator column per token.
if 'frame.protocols' in df_processed.columns:
    protocols_dummies = df_processed['frame.protocols'].str.get_dummies(sep=':')
    df_processed = pd.concat([df_processed.drop('frame.protocols', axis=1), protocols_dummies], axis=1)

# Label Encode the target variable 'label'
le = LabelEncoder()
df_processed['label'] = le.fit_transform(df_processed['label'])
print("Target variable 'label' encoded.")

# Separate features (X) and target (y)
X = df_processed.drop('label', axis=1)
y = df_processed['label']

# Ensure all feature columns are numeric; values that cannot be parsed become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# --- Drop columns that contain only missing values ---
X = X.dropna(axis=1, how='all')
print("Dropped columns with all missing values.")

# --- Data Splitting ---
# The split happens BEFORE imputation so that the imputer's column means are
# learned from the training rows only. Fitting the imputer on the full dataset
# would leak test-set statistics into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# A column may have observed values only in rows that landed in the test set.
# Such a column has no training mean, so keep only the columns that have at
# least one observed value in the training rows.
observed_in_train = X_train.notna().any(axis=0)
X_train = X_train.loc[:, observed_in_train]
X_test = X_test.loc[:, observed_in_train]

# --- Handle remaining missing values with the training-set mean ---
# NOTE: X itself is left un-imputed; only X_train / X_test are imputed.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)
print("Missing values handled using mean imputation.")
print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples) sets.")


# --- Hyperparameter Tuning with RandomizedSearchCV ---
print("\n--- Tuning Hyperparameters for RandomForestClassifier ---")
# Cross-check: the output shows the best parameter combination found by the search.

# Candidate values sampled by the random search.
#
# n_estimators - number of trees in the forest.
#   50 is a baseline to see whether a relatively small forest is enough,
#   100 is a very common and effective default, and 150 tests whether more
#   trees give a meaningful benefit. 500 or 1000 could also be tried, but the
#   gain from extra trees diminishes: the jump in accuracy from 50 to 100 trees
#   is usually much larger than from 450 to 500.
#
# max_depth - maximum depth (number of decision splits) of each tree.
#   10 forces a relatively shallow, simpler model; 20 and 30 allow
#   progressively more complex models that can learn very detailed patterns.
#   None lets each tree grow as deep as possible: the most complex model,
#   with the highest potential performance and the highest overfitting risk.
#
# min_samples_leaf - minimum number of samples required at a leaf.
#   1 is the default and lets a tree create a rule for a single, possibly noisy
#   data point. 2 and 4 force every rule to cover at least 2 or 4 samples,
#   which makes the model smoother and more robust (compare the lab4.py KNN
#   experiment, where a higher 'k' smoothed the decision boundary).
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, 30, None],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator plus the randomized search wrapped around it.
# 5 sampled candidates x 3-fold cross-validation, run on all available cores.
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=5,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X_train, y_train)  # fit() returns the fitted search object itself

print("\nBest parameters found by RandomizedSearchCV:")
print(random_search.best_params_)
print("\nHyperparameter tuning is complete.")

# Result recorded from the author's run:
#   {'n_estimators': 50, 'min_samples_leaf': 2, 'max_depth': None}
# - n_estimators=50: adding more trees (100 or 150) mostly did not improve
#   results enough to be worth the extra computational cost.
# - min_samples_leaf=2: the default of 1 can let the model "memorize" the
#   training data (overfitting). Choosing 2 means a slightly more generalized
#   model did better; every leaf rule must rest on at least two data points,
#   which makes the model more robust to noise.
# - max_depth=None: the search preferred unrestricted depth over a restricted
#   one (such as 10 or 30), which indicates that the patterns in the IoT data
#   are complex and need very specific, deep rules to separate the classes.


#A3
import pandas as pd
import warnings

# Import preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Import all required classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import catboost as ctb

# Import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Step 1: Data Preprocessing ---
print("--- Starting Data Preprocessing ---")

# Load the dataset. Only the read itself is inside the try block so that the
# handler cannot mask an unrelated error.
try:
    df = pd.read_csv('iotsim-air-quality-1.csv')
except FileNotFoundError:
    print("Error: 'iotsim-air-quality-1.csv' not found. Please ensure the file is in the correct directory.")
    # Exit with a non-zero status so callers can detect the failure.
    # The `exit()` helper comes from the `site` module and is not guaranteed
    # to exist; called without arguments it also reports success.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.")

# Drop columns that are not useful for general modeling
df_processed = df.drop(columns=['frame.time', 'eth.src', 'eth.dst', 'ip.dst', 'ip.src'])

# One-hot encode the 'frame.protocols' column: each value is split on ':' and
# expanded into one 0/1 indicator column per token.
if 'frame.protocols' in df_processed.columns:
    try:
        protocols_dummies = df_processed['frame.protocols'].str.get_dummies(sep=':')
        df_processed = pd.concat([df_processed.drop('frame.protocols', axis=1), protocols_dummies], axis=1)
        print("One-hot encoded 'frame.protocols'.")
    except Exception as e:
        # Deliberate best-effort: on failure the raw column stays in place.
        # It is then coerced to NaN below and removed by the all-missing
        # column drop, so the pipeline continues without protocol features.
        print(f"Could not one-hot encode protocols: {e}")

# Label Encode the target variable 'label'
le = LabelEncoder()
df_processed['label'] = le.fit_transform(df_processed['label'])
print("Target variable 'label' encoded.")

# Separate features (X) and target (y)
X = df_processed.drop('label', axis=1)
y = df_processed['label']

# Ensure all feature columns are numeric; values that cannot be parsed become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Drop columns that contain only missing values before imputation
X = X.dropna(axis=1, how='all')

# Split data into training and testing sets.
# The split happens BEFORE imputation so that the imputer's column means are
# learned from the training rows only. Fitting the imputer on the full dataset
# would leak test-set statistics into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# A column may have observed values only in rows that landed in the test set.
# Such a column has no training mean, so keep only the columns that have at
# least one observed value in the training rows.
observed_in_train = X_train.notna().any(axis=0)
X_train = X_train.loc[:, observed_in_train]
X_test = X_test.loc[:, observed_in_train]

# Handle any remaining missing values using the training-set mean.
# NOTE: X itself is left un-imputed; only X_train / X_test are imputed.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)
print("Missing values handled.")
print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples) sets.")
print("-" * 20)


# --- Step 2: Employ, Evaluate, and Tabulate Classifiers ---

# Imported here because only this section needs them.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def _weighted_scores(y_true, y_pred):
    """Return accuracy and weighted precision/recall/F1 for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by a fitted classifier for the same rows.

    Returns:
        Dict keyed by metric name ("Accuracy", "Precision", "Recall",
        "F1-Score"). Precision, recall and F1 use average='weighted', and
        zero_division=0 so a class that is never predicted scores 0 instead of
        raising a warning.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "Recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, average='weighted', zero_division=0),
    }


# Define the dictionary of classifiers to be used.
# SVC and MLPClassifier are sensitive to feature scale, so each is wrapped in a
# pipeline that standardizes the features first. The scaler is fit inside
# clf.fit() on the training rows only. The tree-based models and GaussianNB
# are used on the raw features, as before.
# NOTE: observations recorded for an earlier, unscaled run will no longer match
# the SVM and MLP rows; re-run to refresh them.
classifiers = {
    "Support Vector Machine": make_pipeline(StandardScaler(), SVC(random_state=42)),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "CatBoost": ctb.CatBoostClassifier(random_state=42, verbose=0),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    "Naïve-Bayes": GaussianNB(),
    "MLP": make_pipeline(StandardScaler(), MLPClassifier(random_state=42, max_iter=500)),
}

# Create an empty list to store results
results_list = []

# Loop through each classifier
for name, clf in classifiers.items():
    print(f"Training {name}...")

    # Train the model
    clf.fit(X_train, y_train)

    # Score predictions on both splits; the train/test gap exposes overfitting
    train_scores = _weighted_scores(y_train, clf.predict(X_train))
    test_scores = _weighted_scores(y_test, clf.predict(X_test))

    # Interleave Train/Test per metric so the two values sit side by side
    row = {"Classifier": name}
    for metric in ("Accuracy", "Precision", "Recall", "F1-Score"):
        row[f"Train {metric}"] = train_scores[metric]
        row[f"Test {metric}"] = test_scores[metric]
    results_list.append(row)

# Convert the list of results into a DataFrame for tabulation
results_df = pd.DataFrame(results_list)

# --- Step 3: Display Tabulated Results and Observations ---

print("\n--- Classifier Performance Comparison ---")
print(results_df.to_string())

#The table above suggests that the patterns in the data are complex and non-linear, requiring the detailed, rule-based approach that decision trees excel at.
#The Decision Tree is the classic example of overfitting here. It achieved a near-perfect Train Accuracy of 0.997690 but a slightly lower Test Accuracy of 0.994006; it essentially "memorized" the training data. The RandomForest shows the same effect to a greater extent, with a Train Accuracy of 0.997690 and a Test Accuracy of 0.971482. While it is still a very strong model, the gap indicates that it learned the training data slightly too well.
#The poor performance of some models indicates that the boundaries separating benign and malicious traffic in the dataset are not simple or linear. Models that assume simpler relationships cannot capture the complexity as well as tree ensembles.
#Hence the low Test Precision for SVM (0.379) and AdaBoost (0.384).


#A5 - The project is actually a classification problem, but regression was done in previous labs; clustering is done here to understand the structure of the dataset.
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Import preprocessing tools
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

print("--- A5: Hierarchical Clustering ---")


df = pd.read_csv('iotsim-air-quality-1.csv')


# Drop columns that are not useful for general modeling, and the label
# (this section does clustering, not classification)
X = df.drop(columns=['label', 'frame.time', 'eth.src', 'eth.dst', 'ip.dst', 'ip.src'])

# One-hot encode the 'frame.protocols' column: each value is split on ':' and
# expanded into one 0/1 indicator column per token.
if 'frame.protocols' in X.columns:
    protocols_dummies = X['frame.protocols'].str.get_dummies(sep=':')
    X = pd.concat([X.drop('frame.protocols', axis=1), protocols_dummies], axis=1)
    print("One-hot encoded 'frame.protocols'.")

# Ensure all feature columns are numeric; values that cannot be parsed become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Drop columns that contain only missing values before imputation
X = X.dropna(axis=1, how='all')

# Handle any remaining missing values using the column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns)
print("Missing values handled.")

# Cluster on a reproducible random sample of the data rather than every row.
# The sample size is capped at the number of available rows, because
# DataFrame.sample raises ValueError when n exceeds the population and
# replace=False.
X_sample = X.sample(n=min(1000, len(X)), random_state=42)

# Feature Scaling is crucial for distance-based clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_sample)
print("Data sampled and scaled.")

# --- Step 2: Perform Hierarchical Clustering ---
# Build the linkage matrix from the scaled sample. With the 'ward' method each
# merge is chosen to minimize the variance of the clusters being merged.
print("Performing Hierarchical Clustering...")
linked = linkage(X_scaled, 'ward')

# --- Step 3: Plot the Dendrogram ---
# The dendrogram shows the nested merge structure and helps decide on the
# number of clusters.
print("Plotting Dendrogram...")
plt.figure(figsize=(12, 7))
dendrogram(
    linked,
    p=20,
    truncate_mode='lastp',  # show only the last 'p' merged clusters for clarity
    show_leaf_counts=True,
    distance_sort='descending',
    orientation='top',
)
plt.suptitle('Hierarchical Clustering')
plt.title('Hierarchical Clustering Dendrogram (Truncated)')
plt.ylabel('Distance (Ward)')
plt.xlabel('Cluster Size')
plt.show()

#The Y axis measures how dissimilar or "far apart" the clusters are. A longer vertical line means that the two clusters being merged were very distinct from each other.
#Inference: there are several long vertical lines. They indicate that the clusters being merged were far apart, meaning the groups are naturally well-separated and distinct.
#Cutting across the longest vertical lines suggests that the data could be naturally divided into 2, 3, or perhaps 4 primary clusters.
#However, there are actually 5 labels: some of the attack label classes look very similar.