-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
81 lines (59 loc) · 2.03 KB
/
utils.py
File metadata and controls
81 lines (59 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
from scipy import stats
def k_means(data, k, max_iter=300):
    """Cluster the rows of ``data`` into ``k`` groups with Lloyd's algorithm.

    Centroids are seeded with ``k`` distinct rows chosen at random, then the
    assign/update steps are repeated until the assignment stops changing
    (or ``max_iter`` rounds have run).

    Parameters
    ----------
    data : np.ndarray
        2-D array of shape ``(n_samples, n_features)``; any feature count.
    k : int
        Number of clusters; must not exceed the number of rows.
    max_iter : int, optional
        Upper bound on the number of assign/update rounds.

    Returns
    -------
    tuple
        ``(centroids, labels, clusters)`` where ``centroids`` has shape
        ``(k, n_features)``, ``labels`` holds the 1-based cluster number of
        every row, and ``clusters`` is a list of ``k`` lists containing the
        rows assigned to each cluster.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1, or (raised by
        ``np.random.choice``) if ``k`` exceeds the number of rows.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Seed the centroids with k distinct rows drawn at random.
    centroids = data[np.random.choice(data.shape[0], k, replace=False)]
    labels = None
    for _ in range(max_iter):
        # Squared distances, shape (k, n_samples). The square root is
        # skipped because it does not change which centroid is nearest.
        sq_distances = np.sum((data - centroids[:, np.newaxis]) ** 2, axis=2)
        assignment = np.argmin(sq_distances, axis=0)
        # Converged: the centroids were computed from this very assignment.
        if labels is not None and np.all(assignment == labels):
            break
        labels = assignment
        # An empty cluster keeps its previous centroid instead of becoming
        # NaN (the mean of an empty slice).
        centroids = np.array(
            [
                data[labels == i].mean(axis=0)
                if (labels == i).any()
                else centroids[i]
                for i in range(k)
            ]
        )

    clusters = [[] for _ in range(k)]
    for i, label in enumerate(labels):
        clusters[label].append(data[i])
    return (centroids, labels + 1, clusters)
def k_means_predict(data, centroids):
    """Assign every row of ``data`` to its nearest centroid.

    Parameters
    ----------
    data : np.ndarray
        Samples, one per row, with the same feature count as ``centroids``.
    centroids : np.ndarray
        Cluster centres, one per row.

    Returns
    -------
    tuple
        ``(distances, labels, clusters)``: the Euclidean distance from each
        centroid to each sample (centroids along the first axis), the
        1-based index of the nearest centroid per sample, and a list with
        one list of samples per centroid.
    """
    # Broadcasting a (k, 1, d) view of the centroids against the samples
    # yields one offset vector per (centroid, sample) pair.
    offsets = data - centroids[:, np.newaxis]
    distances = np.sqrt(np.square(offsets).sum(axis=2))
    nearest = distances.argmin(axis=0)

    clusters = [[] for _ in centroids]
    for sample, cluster_idx in zip(data, nearest):
        clusters[cluster_idx].append(sample)
    return (distances, nearest + 1, clusters)
def knn(test_data, train_data, k):
    """Return the indices of the ``k`` nearest training rows per test row.

    Parameters
    ----------
    test_data : np.ndarray
        2-D array of shape ``(n_test, n_features)``.
    train_data : np.ndarray
        2-D array of shape ``(n_train, n_features)``.
    k : int
        Number of neighbours to return, ``0 <= k <= n_train``.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(n_test, k)``; each row lists the indices
        into ``train_data`` of the ``k`` closest rows by Euclidean distance,
        in no particular order.

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of training rows.
    """
    n_train = train_data.shape[0]
    if not 0 <= k <= n_train:
        raise ValueError(
            f"k must be between 0 and the number of training rows "
            f"({n_train}), got {k}"
        )

    # Squared distances rank neighbours exactly like true distances, so the
    # square root is not needed.
    sq_distances = np.sum(
        np.square(
            test_data[:, np.newaxis, :] - train_data[np.newaxis, :, :]
        ),
        axis=2,
    )
    # Pivoting at index k - 1 puts the k smallest entries in the first k
    # slots and stays in bounds when k equals the number of training rows.
    pivot = max(k - 1, 0)
    return np.argpartition(sq_distances, pivot, axis=1)[:, :k]
def speedy_knn(df, centroids, k):
    """Label every row of ``df`` by a vote among neighbours in its own cluster.

    The rows are first assigned to clusters with ``k_means_predict`` using
    the first four columns as features. For each row, the search for nearest
    rows is restricted to the rows sharing its cluster, and the most common
    value of the last column of ``df`` among those rows becomes the result.

    Parameters
    ----------
    df : DataFrame-like
        Object exposing ``.values``; the first four columns are used as
        features and the last column is the value being voted on
        (presumably the class label -- confirm against the caller).
    centroids : np.ndarray
        Cluster centres passed to ``k_means_predict``.
    k : int
        Number of neighbours; ``k + 1`` rows are kept per query since the
        candidate rows include the query row itself.

    Returns
    -------
    np.ndarray
        The voted value for every row, as ``int16``.
    """
    values = df.values
    cluster_column = np.zeros((values.shape[0], 1))
    data = np.concatenate((values, cluster_column), axis=1)
    # Fill the appended last column with each row's predicted cluster label.
    data[:, -1] = k_means_predict(data[:, :4], centroids)[1]

    n_keep = k + 1
    labels = []
    for sample in data:
        # Candidate neighbours: rows sharing this row's cluster.
        same_cluster = data[:, -1] == sample[-1]
        cluster = data[same_cluster]
        offsets = cluster[:, :4] - sample[:4]
        distance = np.sqrt(np.sum(np.square(offsets), axis=1))
        if len(cluster) <= n_keep:
            # Too few candidates to partition: every one of them votes.
            nearest_samples = cluster
        else:
            closest = np.argpartition(distance, n_keep)[:n_keep]
            nearest_samples = cluster[closest]
        # Column -2 is the last column of the original frame.
        labels.append(stats.mode(nearest_samples[:, -2]).mode)
    return np.array(labels, dtype="int16")