-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClustering.py
More file actions
61 lines (50 loc) · 2.8 KB
/
Copy pathClustering.py
File metadata and controls
61 lines (50 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import time
import numpy as np
from itertools import combinations
from Functions import clustering, cluster_closeness_matrix, \
load_review_vectors, assign_clusters, p_correct_clusters
# knn: number of neighbours used to assign a label to a new cluster
# k: number of clusters
knn = 9
k = 5
# Review counts passed to the loader for the training and test files
n_train = 1000
n_test = 100
# Methods currently switched off: 'Gaussian-Mixture', 'Agglomerative', 'Birch', 'Spectral'
cluster_types = ['K-means', 'Minibatch-Kmeans']
filename_list = [
    'train_fasttext',
    'train_minhash',
    'test_fasttext',
    'test_minhash',
]
# Every vector file is looked up in the current working directory as '<name>_vectors.csv'
path_list = [os.path.join(os.getcwd(), f'{stem}_vectors.csv') for stem in filename_list]
# Load the fasttext and minhash training files (first two entries of path_list),
# timing both reads together.
t = time.time()
(ft_train_v, ft_train_r), (mh_train_v, mh_train_r) = (
    load_review_vectors(path_list[0], no_reviews=n_train),
    load_review_vectors(path_list[1], no_reviews=n_train),
)
print(f'Loading training data took: {time.time() - t}')
# Report the dimensions of the loaded vector arrays for a quick sanity check
print(f'Shape of training data:\nft: {ft_train_v.shape}\nmh: {mh_train_v.shape}')
# Load the fasttext and minhash test files (last two entries of path_list),
# timing both reads together.
t = time.time()
(ft_test_v, ft_test_r), (mh_test_v, mh_test_r) = (
    load_review_vectors(path_list[2], no_reviews=n_test),
    load_review_vectors(path_list[3], no_reviews=n_test),
)
print(f'Loading test data took: {time.time() - t}')
# Report the dimensions of the loaded vector arrays for a quick sanity check
print(f'Shape of test data:\nft: {ft_test_v.shape}\nmh: {mh_test_v.shape}')
# Result accumulators: one entry per clustering method, each a (fasttext, minhash) pair
proportion_correct, cluster_assignments, cc_mats, models, weights = [], [], [], [], []

for name in cluster_types:
    # Run the clustering on the fasttext vectors and report its own elapsed time
    t = time.time()
    labels_ft, model_ft = clustering(ft_train_v, method=name)
    print(f'{name} took: {time.time() - t} seconds on fasttext')

    # Restart the timer so the minhash figure does not include the fasttext run
    t = time.time()
    labels_mh, model_mh = clustering(mh_train_v, method=name)
    print(f'{name} took: {time.time() - t} seconds on minhash')

    t = time.time()
    # Proportion of each class in the clusters, each row is a cluster column is star rating
    m1, w1 = cluster_closeness_matrix(ft_train_r, labels_ft, decimals=4)
    m2, w2 = cluster_closeness_matrix(mh_train_r, labels_mh, decimals=4)

    # Using the maximum proportions assign each cluster a star rating, creates a dict
    label_map_ft = assign_clusters(m1, w1)
    label_map_mh = assign_clusters(m2, w2)

    # Score the held-out test data with each fitted model against its cluster-to-rating map
    correct_proportion_ft = p_correct_clusters(ft_test_r, ft_test_v, label_map_ft, model=model_ft)
    correct_proportion_mh = p_correct_clusters(mh_test_r, mh_test_v, label_map_mh, model=model_mh)
    print(f'Calculating the correct proportion took {time.time()-t} seconds')

    # Append all desired data to corresponding lists
    cluster_assignments.append((label_map_ft, label_map_mh))
    cc_mats.append((m1, m2))
    models.append((model_ft, model_mh))
    weights.append((w1, w2))
    proportion_correct.append((correct_proportion_ft, correct_proportion_mh))

# Final summary across all clustering methods
print(proportion_correct)
# NOTE(review): the message talks about per-class proportions but prints `weights`;
# the matrices are kept in `cc_mats` - confirm which one is meant to be shown.
print(f'The proportion of each class in each cluster is: {weights}')