-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
263 lines (210 loc) · 10.2 KB
/
app.py
File metadata and controls
263 lines (210 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import streamlit as st
import numpy as np
import pickle
from models.logistic_regression import LogisticRegression
from libs.sense_proc import TextPreprocessor
# Define the custom DecisionTree class that was used to save the model
# This must exactly match the class that was used when saving the model
class TreeNode:
def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
self.feature_index = feature_index
self.threshold = threshold
self.left = left
self.right = right
self.value = value
def is_leaf_node(self):
return self.value is not None
class DecisionTreeClassifierCustom:
def __init__(self, max_depth=15, min_samples_split=10, max_thresholds=10):
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.max_thresholds = max_thresholds
self.root = None
def fit(self, X, y):
self.root = self._build_tree(X, y)
def _build_tree(self, X, y, depth=0):
num_samples, num_features = X.shape
num_labels = len(np.unique(y))
if depth >= self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
return TreeNode(value=self._most_common_label(y))
best_feat, best_thresh = self._best_split(X, y)
if best_feat is None:
return TreeNode(value=self._most_common_label(y))
left_indices = X[:, best_feat] <= best_thresh
right_indices = X[:, best_feat] > best_thresh
left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
right = self._build_tree(X[right_indices], y[right_indices], depth + 1)
return TreeNode(feature_index=best_feat, threshold=best_thresh, left=left, right=right)
def _best_split(self, X, y):
best_gain = -1
best_index, best_thresh = None, None
for feature_index in range(X.shape[1]):
X_column = X[:, feature_index]
unique_values = np.unique(X_column)
if len(unique_values) > self.max_thresholds:
thresholds = np.linspace(min(unique_values), max(unique_values), self.max_thresholds)
else:
thresholds = unique_values
for threshold in thresholds:
gain = self._information_gain(y, X_column, threshold)
if gain > best_gain:
best_gain = gain
best_index = feature_index
best_thresh = threshold
return best_index, best_thresh
def _information_gain(self, y, feature_column, threshold):
parent_entropy = self._entropy(y)
left_mask = feature_column <= threshold
right_mask = feature_column > threshold
if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
return 0
n = len(y)
n_l, n_r = len(y[left_mask]), len(y[right_mask])
e_l = self._entropy(y[left_mask])
e_r = self._entropy(y[right_mask])
weighted_entropy = (n_l / n) * e_l + (n_r / n) * e_r
return parent_entropy - weighted_entropy
def _entropy(self, y):
counts = np.bincount(y)
probabilities = counts / len(y)
return -np.sum([p * np.log2(p) for p in probabilities if p > 0])
def _most_common_label(self, y):
from collections import Counter
return Counter(y).most_common(1)[0][0]
def predict(self, X):
return np.array([self._traverse_tree(x, self.root) for x in X])
def _traverse_tree(self, x, node):
if node.is_leaf_node():
return node.value
if x[node.feature_index] <= node.threshold:
return self._traverse_tree(x, node.left)
else:
return self._traverse_tree(x, node.right)
# Load vocab and idf
vocab = np.load("models_pretrained/vocab.npy", allow_pickle=True).tolist()
idf = np.load("models_pretrained/idf.npy", allow_pickle=True)
# Define available models
available_models = {
"Logistic Regression": "models_pretrained/logistic_regression.pkl",
"SVM": "models_pretrained/svm_model.pkl",
"DecisionTree": "models_pretrained/dt.pkl" # Updated path
}
# Define path to the vectorizer
dt_vectorizer_path = "models_pretrained/custom_tfidf_vectorizer.pkl" # Updated path
# Setup preprocessor
preprocessor = TextPreprocessor()
preprocessor.vocab = vocab
preprocessor.idf_vector = idf
# Streamlit UI config
st.set_page_config(page_title="SenseNet2D: sentiment analyzer", page_icon="📝", layout="centered")
st.title("SenseNet2D: sentiment analyzer")
# Add a debug mode toggle
debug_mode = st.sidebar.checkbox("Debug Mode", value=False)
# Model selection dropdown
selected_model_name = st.selectbox(
"Choose model:",
options=list(available_models.keys())
)
# User input
input_text = st.text_area("Enter your text here...", height=150)
# Test cases for quick testing
if st.sidebar.checkbox("Show Test Cases"):
test_cases = [
"I absolutely loved the product, it was amazing!",
"I am sad and disappointed.",
"Not sure what I feel about this.",
"Worst thing ever!",
"Such a beautiful day!",
]
selected_test = st.sidebar.selectbox("Select a test case:", test_cases)
if st.sidebar.button("Use Selected Test Case"):
input_text = selected_test
# Handle form submission
if st.button("Classify"):
if input_text.strip() == "":
st.warning("Please enter some text.")
else:
# Load the selected model
model_path = available_models[selected_model_name]
try:
# Load the model based on its type
if selected_model_name == "Logistic Regression":
with open(model_path, "rb") as f:
model_params = pickle.load(f)
model = LogisticRegression()
model.W = model_params['weights']
model.B = model_params['bias']
# Preprocess using your custom preprocessor
processed_text = preprocessor.preprocess(input_text)
feature_vector = preprocessor.tfidf([processed_text])
# Debug information
if debug_mode:
st.write("Model type: Logistic Regression")
st.write(f"Feature vector shape: {feature_vector.shape}")
# Predict
prediction = model.predict(feature_vector)[0]
sentiment = "Positive" if prediction == 1 else "Negative"
elif selected_model_name == "SVM":
# Load the SVM model components
with open(model_path, "rb") as f:
svm_data = pickle.load(f)
# Extract components
svm_model = svm_data['model']
vectorizer = svm_data['vectorizer']
scaler = svm_data['scaler']
# Use SVM's own preprocessing pipeline
feature_vector_raw = vectorizer.transform([input_text]).toarray()
feature_vector = scaler.transform(feature_vector_raw)
# Debug information
if debug_mode:
st.write("Model type: SVM")
st.write(f"Feature vector shape: {feature_vector.shape}")
# Predict
prediction = svm_model.predict(feature_vector)[0]
# Convert back using label_encoder if available
if 'label_encoder' in svm_data:
label_encoder = svm_data['label_encoder']
sentiment = label_encoder.inverse_transform([prediction])[0]
else:
sentiment = "Positive" if prediction == 1 else "Negative"
elif selected_model_name == "DecisionTree":
# Load the Decision Tree model
with open(model_path, "rb") as f:
dt_model = pickle.load(f)
# Load the TF-IDF vectorizer specifically for Decision Tree
try:
with open(dt_vectorizer_path, "rb") as f_vec:
dt_vectorizer = pickle.load(f_vec)
if debug_mode:
st.write("Model type: Decision Tree")
st.write("Using paired TF-IDF vectorizer")
# Transform input text using the specific vectorizer
feature_vector = dt_vectorizer.transform([input_text]).toarray()
if debug_mode:
st.write(f"Feature vector shape: {feature_vector.shape}")
# Show sample of feature vector
st.write("Sample of feature vector (first 10 values):")
st.write(feature_vector[0][:10])
# Check for zeros
non_zero = np.count_nonzero(feature_vector)
st.write(f"Non-zero features: {non_zero} out of {feature_vector.size}")
# Make prediction
prediction = dt_model.predict(feature_vector)[0]
if debug_mode:
st.write(f"Raw prediction value: {prediction}")
# Map prediction to sentiment
sentiment = "Positive" if prediction == 1 else "Negative"
except FileNotFoundError:
st.error(f"Decision Tree vectorizer not found at {dt_vectorizer_path}")
st.info("Please make sure the vectorizer is saved in the models_pretrained directory")
raise
# Display result with the model name
st.header(f"Prediction: {sentiment}")
st.info(f"Model used: {selected_model_name}")
except FileNotFoundError:
st.error(f"Model file not found: {model_path}")
st.info("Make sure the model file is in the correct location")
except Exception as e:
st.error(f"Error during prediction: {str(e)}")
import traceback
st.code(traceback.format_exc())