# ML-Labs/lab5.py (Cairo09/ML-Labs on GitHub)

# A1

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the capture and regress frame length on a single attribute.
df = pd.read_csv("iotsim-air-quality-1.csv")

X, y = df[["ip.ttl"]], df["frame.len"]  # one-column feature frame, target column

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the linear model on the training portion only (A1 requirement).
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict back on the same rows the model was fitted on.
y_train_pred = reg.predict(X_train)

# Report the fitted line and a small sample of its predictions.
print("Coefficient:", reg.coef_[0])
print("Intercept:", reg.intercept_)
print("First 7 Train Predictions:", y_train_pred[:7])
# The intercept (81.08) suggests that for an ip.ttl value of 0, the expected packet length would be about 81 bytes.
# Because ip.ttl takes only a few distinct values in this dataset (mostly 61 and 64), the predictions vary very little: most predicted lengths fall between 75.69 and 75.86 bytes. This implies that ip.ttl alone is not a strong predictor of frame.len in this data, and more features would be needed for meaningful prediction.

#A2

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# Reload the data so this section runs independently of A1.
df = pd.read_csv("iotsim-air-quality-1.csv")

# Single predictor (kept as a one-column DataFrame) and the target column.
X = df[["ip.ttl"]]
y = df["frame.len"]

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training rows only.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predictions for both partitions, in (train, test) order.
y_train_pred, y_test_pred = (reg.predict(part) for part in (X_train, X_test))

# ===== Metrics function =====
def calculate_metrics(y_true, y_pred):
    """Return ``(mse, rmse, mape, r2)`` for predictions against true values.

    RMSE is the square root of the MSE; the remaining values come directly
    from the metric functions this file imports from ``sklearn.metrics``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_percentage_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# Score the fitted model on both partitions.
train_mse, train_rmse, train_mape, train_r2 = calculate_metrics(y_train, y_train_pred)
test_mse, test_rmse, test_mape, test_r2 = calculate_metrics(y_test, y_test_pred)

# One print call per partition; sep="\n" puts every metric on its own line.
print(
    "=== Train Set Metrics ===",
    f"MSE: {train_mse:.4f}",
    f"RMSE: {train_rmse:.4f}",
    f"MAPE: {train_mape:.4f}",
    f"R²: {train_r2:.4f}",
    sep="\n",
)

print(
    "\n=== Test Set Metrics ===",
    f"MSE: {test_mse:.4f}",
    f"RMSE: {test_rmse:.4f}",
    f"MAPE: {test_mape:.4f}",
    f"R²: {test_r2:.4f}",
    sep="\n",
)
# Very low R² (0.0009): an R² close to zero means the model explains almost none of the variation in frame.len, so the predictor (ip.ttl) has almost no linear relationship with the target.
# Train and test metrics are nearly identical: the model is not overfitting or underfitting in the typical sense; it is simply weak because the feature carries very little predictive information.

#A3
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

df = pd.read_csv("iotsim-air-quality-1.csv")

# Predictor columns, chosen because they are mostly populated.
features = ['ip.ttl', 'ip.proto', 'udp.srcport', 'udp.dstport']

# Keep only rows where every predictor and the target are present.
df = df.dropna(subset=[*features, 'frame.len'])

# Feature matrix and target vector.
X, y = df[features], df['frame.len']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the multi-feature linear model on the training rows.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict on both partitions for the metric comparison below.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

# Metrics function
def get_metrics(y_true, y_pred):
    """Compute MSE, RMSE, MAPE and R² for a set of predictions.

    Returns the four values as a tuple in that order; RMSE is derived from
    the MSE with ``np.sqrt``.
    """
    mse = mean_squared_error(y_true, y_pred)
    mape, r2 = (
        metric(y_true, y_pred)
        for metric in (mean_absolute_percentage_error, r2_score)
    )
    return mse, np.sqrt(mse), mape, r2

# Evaluate the model on the training and held-out rows.
train_mse, train_rmse, train_mape, train_r2 = get_metrics(y_train, y_train_pred)
test_mse, test_rmse, test_mape, test_r2 = get_metrics(y_test, y_test_pred)

# Emit each report as one newline-joined string (same output as line-by-line prints).
print("\n".join((
    "=== Train Set Metrics ===",
    f"MSE: {train_mse:.4f}",
    f"RMSE: {train_rmse:.4f}",
    f"MAPE: {train_mape:.4f}",
    f"R²: {train_r2:.4f}",
)))

print("\n".join((
    "\n=== Test Set Metrics ===",
    f"MSE: {test_mse:.4f}",
    f"RMSE: {test_rmse:.4f}",
    f"MAPE: {test_mape:.4f}",
    f"R²: {test_r2:.4f}",
)))
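# RMSE dropped from about 52 in A1/A2 to about 10 in A3, a large decrease showing that predictions are much closer to the actual values. Note that A3 also drops rows with missing values in the selected columns, so it may not be evaluated on exactly the same rows as A1/A2.
# R² = 0.25: the model explains about 25% of the variance in frame.len.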

# A4
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

df = pd.read_csv("iotsim-air-quality-1.csv")

# Build the feature matrix step by step:
# numeric columns only, minus 'label_encoded' when that column exists,
# minus any column that contains no values at all.
features = df.select_dtypes(include=[np.number])
features = features.drop(columns=['label_encoded'], errors='ignore')
features = features.dropna(axis=1, how='all')

# Replace the remaining missing entries with each column's mean and
# rebuild a DataFrame so the column names are kept.
imputer = SimpleImputer(strategy='mean')
features = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)

# Same 80/20 ratio and seed as the regression sections.
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)

# Two-cluster K-Means fitted on the training rows.
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
kmeans.fit(X_train)

# Show the per-row cluster assignment and the fitted centers.
print("Cluster labels (train):", kmeans.labels_)
print("\nCluster centers:\n", kmeans.cluster_centers_)
# The cluster centers differ clearly in several attributes (for example, feature 4 is 47051.4163 in Cluster 0 vs. 42798.8728 in Cluster 1), indicating measurable separation in the feature space even without using the target labels.

#A5
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

df = pd.read_csv("iotsim-air-quality-1.csv")

# Numeric columns only, without 'label_encoded' (if present) and without
# columns that are entirely NaN.
features = (
    df.select_dtypes(include=[np.number])
    .drop(columns=['label_encoded'], errors='ignore')
    .dropna(axis=1, how='all')
)

# Mean-impute the remaining gaps, keeping the column names.
imputer = SimpleImputer(strategy='mean')
features = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)

# Same split ratio and seed as the earlier sections.
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)

# Two-cluster K-Means on the training rows.
kmeans = KMeans(n_clusters=2, random_state=42, n_init="auto")
kmeans.fit(X_train)

# Internal validity scores, all computed on the training rows and their labels.
sil_score = silhouette_score(X_train, kmeans.labels_)
ch_score = calinski_harabasz_score(X_train, kmeans.labels_)
db_index = davies_bouldin_score(X_train, kmeans.labels_)

# Report the three scores in fixed-point notation.
print(f"Silhouette Score: {sil_score:f}")
print(f"Calinski–Harabasz Score: {ch_score:f}")
print(f"Davies–Bouldin Index: {db_index:f}")

# Silhouette Score: the result indicates that the clusters are well separated and that data points are closer to their own cluster than to the other one. A score above 0.5 generally reflects meaningful clusters.
# Calinski–Harabasz Score (97,682): the high value suggests that the clusters are dense and well separated, reinforcing the silhouette score's indication of a strong clustering structure.
# Davies–Bouldin Index (0.25): a lower value indicates better separation between clusters, supporting the findings of the other two metrics.

#A6
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_csv("iotsim-air-quality-1.csv")

# Numeric feature matrix: drop 'label_encoded' if it exists, and drop any
# column that has no values at all.
features = df.select_dtypes(include=[np.number]).drop(columns=['label_encoded'], errors='ignore')
features = features.dropna(axis=1, how='all')

# Impute missing values with the column mean (column names are preserved).
imputer = SimpleImputer(strategy='mean')
features = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)

# Train-test split
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)

# Sanity checks on the training matrix.
print("Missing values in X_train:", X_train.isnull().sum().sum())
# Print one True/False that actually answers the label: True only when at
# least one entry is +/-inf. (The previous expression printed a per-column
# "all finite" Series, i.e. the opposite of what the label says.)
print("Any infinite values in X_train:", bool(np.isinf(X_train.to_numpy()).any()))

# Use a smaller random sample to speed up clustering. Cap the size at the
# number of available rows so sample() cannot raise on a small dataset.
sample_size = min(500, len(X_train))
X_train_small = X_train.sample(sample_size, random_state=42)

# Range of k values (smaller range for testing)
k_values = range(2, 6)

# One score per k, in the same order as k_values.
silhouette_scores = []
ch_scores = []
db_indices = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_train_small)
    labels = kmeans.labels_

    sil_score = silhouette_score(X_train_small, labels)
    ch_score = calinski_harabasz_score(X_train_small, labels)
    db_index = davies_bouldin_score(X_train_small, labels)

    silhouette_scores.append(sil_score)
    ch_scores.append(ch_score)
    db_indices.append(db_index)

# Print the scores to confirm
print("Silhouette scores:", silhouette_scores)
print("Calinski-Harabasz scores:", ch_scores)
print("Davies-Bouldin indices:", db_indices)

# Plot each score against k, one panel per metric, side by side.
plt.figure(figsize=(12, 4))

panels = (
    ('Silhouette Score', silhouette_scores),
    ('Calinski-Harabasz Score', ch_scores),
    ('Davies-Bouldin Index', db_indices),
)
for position, (metric_name, scores) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(k_values, scores, marker='o')
    plt.title(f'{metric_name} vs. k')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel(metric_name)
    # k is an integer count, so tick only the evaluated values (as the elbow plot does).
    plt.xticks(k_values)

plt.tight_layout()
plt.show()


#A7
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method: record K-Means inertia for every candidate k.
# NOTE: relies on X_train already being defined earlier in the script.
candidate_ks = range(2, 20)
distortions = []

for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
    kmeans.fit(X_train)
    distortions.append(kmeans.inertia_)

# Inertia against k; the bend in the curve suggests a reasonable k.
plt.plot(candidate_ks, distortions, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Distortion (Inertia)')
plt.xticks(candidate_ks)  # one tick per evaluated k
plt.grid(True)
plt.show()
# From k = 2 to about k = 6 or 7, the curve drops sharply: each new cluster significantly reduces distortion.
# Beyond k = 6 or 7, the slope flattens: adding more clusters gives only small improvements, so the elbow is at roughly k = 6–7.