-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidate.py
More file actions
169 lines (151 loc) · 6.01 KB
/
Copy pathvalidate.py
File metadata and controls
169 lines (151 loc) · 6.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import os
import sys
import json
import nltk
import time
import random
from heapq import heappush, heappop, heappushpop, nsmallest
from math import log
# Debug switch read by dprint(): messages are printed only while this equals 1.
DEBUG = 1
def main():
    """Validate the model for one or more word cutoffs and write a report per run.

    Usage:
        python validate.py start numIter step
        python validate.py start numIter
        python validate.py start
        python validate.py

    ``start`` (default 9) is the first cutoff ``c`` passed to ``validate``,
    ``numIter`` (default 1) is how many cutoffs to try and ``step`` (default 1)
    is the distance between consecutive cutoffs.  Reads ``model.json`` and
    ``validation_set.json`` from the working directory and writes one
    ``results<c>.txt`` file per cutoff.
    """
    with open('model.json') as d:
        model = json.load(d)
    with open('validation_set.json') as d:
        validation_set = json.load(d)

    # define range of c values to test
    c = 9
    numIter = 1
    step = 1
    if len(sys.argv) >= 2:
        c = int(sys.argv[1])
    if len(sys.argv) >= 3:
        numIter = int(sys.argv[2])
    # ">=" like the checks above, so a trailing extra argument does not
    # silently discard the requested step.
    if len(sys.argv) >= 4:
        step = int(sys.argv[3])

    # validate over all c values
    for i in range(numIter):
        current_c = i * step + c

        startTime = time.time()
        results = validate(validation_set, model, current_c)
        endTime = time.time()
        elapsed = endTime - startTime

        legislator_results, vote_results = results[0], results[1]

        # Per-legislator fraction of correctly predicted votes.
        legislator_success = [
            tally.get("success", 0) / float(tally.get("total", 1))
            for tally in legislator_results.values()
        ]
        # Guard the averages: an empty validation set (or one with no known
        # legislators) would otherwise divide by zero.
        if legislator_success:
            legislator_avg = sum(legislator_success) / len(legislator_success)
        else:
            legislator_avg = 0.0

        # The cutoff is only applied by validate() when arguments were given.
        if len(sys.argv) != 1:
            dprint("Words Considered: ", str(current_c))
        dprint("Success rate on average for predicting votes of Congressmen", legislator_avg)
        dprint("Time to validate", str(elapsed // 60) + " minutes " + str(elapsed % 60) + " seconds")

        # count number of correctly predicted bills
        correct = sum(1 for vote in vote_results if vote_results[vote])
        if vote_results:
            bills_correct = float(correct) / len(vote_results)
        else:
            bills_correct = 0.0
        dprint("Correct votes: ", str(bills_correct))

        with open('results' + str(current_c) + '.txt', 'w') as f:
            for p in legislator_success:
                f.write("%s\n" % str(p))
            f.write("%s\n" % ("Legislator Avg: " + str(legislator_avg)))
            f.write("%s\n" % ("Bills Correct: " + str(bills_correct)))
#Validate predicted votes with actual votes
def validate(validation_set, model, c):
    """Predict every recorded vote in ``validation_set`` and score the predictions.

    Args:
        validation_set: mapping of vote id -> record.  Each record is read via
            ``record["bill"]["text"]``, ``record["votes"]`` (mapping of cast
            vote name -> list of legislators carrying an ``"id"``),
            ``record["requires"]`` and ``record["result"]``.
        model: mapping of legislator id -> per-label word counts, as consumed
            by ``generate_label``.
        c: number of top TF-IDF words kept per bill.  Only used when at least
            one command-line argument was supplied; otherwise every token of
            the bill is used with a count of 1.

    Returns:
        ``[legislator_results, vote_results]`` where ``legislator_results``
        maps legislator id -> ``{"success": n, "total": m}`` (``"success"`` is
        absent until the first correct prediction) and ``vote_results`` maps
        vote id -> whether the predicted bill outcome matched the recorded one.
    """
    legislator_results = {}
    vote_results = {}
    dprint("Length of validation set",len(validation_set))

    # Background info: In Congress, due to small legislature differences, some
    # bills are voted as Nay or No and Aye or Yea. For these purposes, we count
    # Nay & No and Aye & Yea the same.
    # Label 0 means Nay/No, label 1 means Yea/Aye, label 2 means Not Voting.
    cast_labels = (("Nay", 0), ("No", 0), ("Yea", 1), ("Aye", 1), ("Not Voting", 2))

    # Restricted TF-IDF version of bill texts is selected by the presence of a
    # command-line argument.  The IDF table does not change between votes, so
    # it is loaded once here instead of once per vote.
    use_tfidf = len(sys.argv) >= 2
    idf = None
    if use_tfidf and validation_set:
        with open("idf.json") as d:
            idf = json.load(d)

    for vote in validation_set:
        record = validation_set[vote]
        # Predicted tallies for this bill: [nay, yea, not voting]; updated in
        # place by generate_label.
        vote_count = [0, 0, 0]

        # Make billText a list of (word, count) tuples.
        tokens = nltk.word_tokenize(record["bill"]["text"])
        if use_tfidf:
            billText = tfidf(tokens, idf, c)
        else:
            billText = [(word, 1) for word in tokens]

        # Validating votes predicted
        for givenVote, givenLabel in cast_labels:
            for legislator in record["votes"].get(givenVote, ()):
                legislator_id = legislator["id"]
                if legislator_id not in model:
                    dprint("Congressman not seen in training set preset in model test - ", givenVote)
                    continue
                label = generate_label(model[legislator_id], billText, vote_count)
                tally = legislator_results.setdefault(legislator_id, {})
                # If predicted correctly
                if label == givenLabel:
                    tally["success"] = tally.get("success", 0) + 1
                tally["total"] = tally.get("total", 0) + 1

        # Determine whether the model passed or failed the bill.
        # NOTE(review): "requires" is presumably the passing threshold as a
        # fraction of yea/(yea + nay) and "result" a boolean - confirm against
        # the code that builds validation_set.json.
        decided = vote_count[0] + vote_count[1]
        if decided:
            model_result = (vote_count[1] / float(decided)) >= record["requires"]
        else:
            # No Yea/Nay predictions at all (e.g. every legislator unknown):
            # treat the bill as not passed instead of dividing by zero.
            model_result = False
        vote_results[vote] = (model_result == record["result"])

    return [legislator_results, vote_results]
#Generate predictions for bills given the legislator(Congressman)
def generate_label(legislator, billText, vote_count):
    """Predict how one legislator votes on a bill and record the prediction.

    For each label present in ``legislator`` the score is the sum, over the
    entries of ``billText``, of the log of the add-k smoothed word frequency
    for that label.  The highest score wins; ties go to the earlier label in
    the order Nay, Yea, Not Voting.

    Args:
        legislator: mapping with optional keys ``"Nay"``, ``"Yea"`` and
            ``"Not Voting"``; each value maps word -> count and holds the
            label's total word count under ``"total_wc !@#"``.
        billText: list of ``(word, count)`` tuples.
        vote_count: three-element list ``[nay, yea, not_voting]``; the entry
            of the predicted label is incremented in place.

    Returns:
        0 for Nay, 1 for Yea, 2 for Not Voting.
    """
    # Laplacian Prior
    k = 3
    unique_words = len({word for (word, _) in billText})

    scores = []
    for label in ("Nay", "Yea", "Not Voting"):
        if label not in legislator:
            # A label with no training data must never win.  Leaving its
            # score at 0.0 would make it the maximum, because every real
            # log-probability is <= 0.
            scores.append(float("-inf"))
            continue
        counts = legislator[label]
        denominator = float(counts.get("total_wc !@#", 0) + k * unique_words)
        score = 0.
        # NOTE(review): the count carried in each billText entry is not used;
        # every entry contributes exactly once.
        for (word, _) in billText:
            score += log((counts.get(word.lower(), 0) + k) / denominator)
        scores.append(score)

    # Choose the highest probability label; index() returns the first maximum,
    # which keeps the Nay > Yea > Not Voting tie-break.
    best = scores.index(max(scores))
    vote_count[best] += 1
    return best
#The TF-IDF algorithm with hyperparameters
def tfidf(billText, idf, c):
    """Return the ``c`` highest-weighted words of a bill with their counts.

    Args:
        billText: list of word tokens; words are lower-cased before counting.
        idf: mapping of word -> value used as the divisor of the weight
            (missing words default to 1), plus the corpus total under
            ``"total_wc !@#"``.
        c: maximum number of words to keep.  Values <= 0 yield an empty list.

    Returns:
        List of ``(word, count_in_bill)`` tuples, in heap order (not sorted).
    """
    # Nothing can be selected; without this guard the loop below would index
    # into an empty heap and raise IndexError.
    if c <= 0:
        return []

    word_count = {}
    for word in billText:
        word = word.lower()
        word_count[word] = word_count.get(word, 0) + 1
    if not word_count:
        return []

    length = float(len(billText))
    # Loop-invariant, so computed once.
    # NOTE(review): the weight is tf * (log(total) / idf[word]), not the
    # textbook tf * log(total / idf[word]); kept as written - confirm intent.
    log_total = log(idf["total_wc !@#"])

    # Obtain the most important words using TF-IDF with a min-heap of size c:
    # the root is always the weakest word currently kept.
    heap = []
    for word in word_count:
        tfidf_val = (word_count[word] / length) * (log_total / idf.get(word, 1))
        if len(heap) < c:
            heappush(heap, (tfidf_val, word))
        elif heap[0][0] < tfidf_val:
            heappushpop(heap, (tfidf_val, word))
    return [(word, word_count[word]) for (_, word) in heap]
def dprint(explanation,msg):
    """Print ``explanation`` and ``msg`` joined by ": " when DEBUG equals 1."""
    if DEBUG != 1:
        return
    print(": ".join((explanation, str(msg))))
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()