VotePredictor/validate.py at master · mstfbl/VotePredictor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import os
import sys
import json
import nltk
import time
import random
from heapq import heappush, heappop, heappushpop, nsmallest
from math import log
# Debug switch read by dprint(): messages are printed only while this equals 1.
DEBUG = 1

def main():
  """Validate the stored model for a range of word-count settings.

  Usage:
    python validate.py start numIter step
    python validate.py start numIter
    python validate.py start
    python validate.py

  start is the first value of c handed to validate(), numIter is how many
  values to try and step is the distance between consecutive values. The
  defaults are start=9, numIter=1, step=1.

  Reads 'model.json' and 'validation_set.json' from the working directory.
  For every tested value it prints a summary through dprint() and writes
  'results<c>.txt' containing one success ratio per legislator followed by
  the legislator average and the share of bills predicted correctly.
  """
  with open('model.json') as d:
    model = json.load(d)
  with open('validation_set.json') as d:
    validation_set = json.load(d)

  # define range of c values to test
  c = 9
  numIter = 1
  step = 1
  if len(sys.argv) >= 2:
    c = int(sys.argv[1])
  if len(sys.argv) >= 3:
    numIter = int(sys.argv[2])
  # ">=" like the checks above, so extra trailing arguments do not silently
  # discard the step value.
  if len(sys.argv) >= 4:
    step = int(sys.argv[3])

  # validate over all c values
  for i in range(numIter):
    words_considered = i * step + c
    startTime = time.time()
    legislator_results, vote_results = validate(validation_set, model, words_considered)
    elapsed = time.time() - startTime

    # per-legislator share of correctly predicted votes
    legislator_success = [
      tally.get("success", 0) / float(tally.get("total", 1))
      for tally in legislator_results.values()
    ]
    # Report 0.0 instead of dividing by zero when no legislator was scored.
    if legislator_success:
      legislator_avg = sum(legislator_success) / len(legislator_success)
    else:
      legislator_avg = 0.0

    # count number of correctly predicted bills
    correct = sum(1 for predicted_ok in vote_results.values() if predicted_ok)
    # Same guard for an empty validation set.
    if vote_results:
      bills_correct = float(correct) / len(vote_results)
    else:
      bills_correct = 0.0

    if len(sys.argv) != 1 :
      dprint("Words Considered: ",  str(words_considered))
    dprint("Success rate on average for predicting votes of Congressmen",legislator_avg)
    dprint("Time to validate",str(elapsed//60) + " minutes " + str(elapsed%60) + " seconds")
    dprint("Correct votes: ",str(bills_correct))

    with open('results' + str(words_considered) + '.txt', 'w') as f:
      for p in legislator_success:
        f.write("%s\n" % str(p))
      f.write("%s\n" % ("Legislator Avg: " + str(legislator_avg)))
      f.write("%s\n" % ("Bills Correct: " + str(bills_correct)))

#Validate predicted votes with actual votes
def validate(validation_set, model, c):
  """Predict every recorded vote in validation_set and score the predictions.

  Parameters:
    validation_set: dict keyed by vote; each entry is read for
      ["bill"]["text"], ["votes"] (vote name -> list of legislators, each
      with an "id"), ["requires"] and ["result"].
    model: dict keyed by legislator id, passed per legislator to
      generate_label().
    c: number of top TF-IDF words kept per bill. Only used when at least one
      command line argument was given; otherwise every token of the bill
      text is used with a count of 1.

  Returns:
    [legislator_results, vote_results] where legislator_results maps a
    legislator id to {"success": n, "total": m} ("success" is absent when
    nothing was predicted correctly) and vote_results maps each vote to
    True/False depending on whether the predicted outcome of the bill
    matched the recorded one.
  """
  legislator_results = {}
  vote_results = {}
  dprint("Length of validation set",len(validation_set))

  #Restricted TF-IDF version of bill texts, cmd line arg for TF-IDF hyperparameter
  use_tfidf = len(sys.argv) >= 2

  # The IDF table does not change between votes, so read it once up front
  # (and only when it is going to be used) instead of once per vote.
  idf = None
  if use_tfidf and validation_set:
    with open("idf.json") as d:
      idf = json.load(d)

  #Background info: In Congress, due to small legislature differences, some
  #bills are voted as Nay or No and Aye or Yea. For these purposes, we count
  #Nay & No and Aye & Yea the same
  #Label = 0 means Nay/No
  #Label = 1 means Yea/Aye
  #Label = 2 means Not Voting
  labelled_votes = (("Nay", 0), ("No", 0), ("Yea", 1), ("Aye", 1), ("Not Voting", 2))

  for vote in validation_set:
    record = validation_set[vote]
    # predictions made for this bill: [nay, yea, not voting]
    vote_count = [0,0,0]

    # billText is a list of (word, count) tuples
    tokens = nltk.word_tokenize(record["bill"]["text"])
    if use_tfidf:
      billText = tfidf(tokens, idf, c)
    else:
      billText = [(word, 1) for word in tokens]

    #Validating votes predicted
    for givenVote, givenLabel in labelled_votes:
      for legislator in record["votes"].get(givenVote, ()):
        legislator_id = legislator["id"]
        if legislator_id not in model:
          dprint("Congressman not seen in training set preset in model test - ", givenVote)
          continue
        label = generate_label(model[legislator_id], billText, vote_count)
        tally = legislator_results.setdefault(legislator_id, {})
        #If predicted correctly
        if label == givenLabel:
          tally["success"] = tally.get("success", 0) + 1
        tally["total"] = tally.get("total", 0) + 1

    # determine if model passed or failed a bill
    decided = vote_count[0] + vote_count[1]
    # With no Nay/Yea prediction at all the Yea share counts as 0.0 rather
    # than raising ZeroDivisionError.
    yea_share = vote_count[1] / float(decided) if decided else 0.0
    model_result = yea_share >= record["requires"]
    vote_results[vote] = (model_result == record["result"])

  return [legislator_results, vote_results]

#Generate predictions for bills given the legislator(Congressman)
def generate_label(legislator, billText, vote_count):
  """Predict how a legislator votes on a bill and record the prediction.

  Each outcome ("Nay", "Yea", "Not Voting") present in the legislator's model
  is scored by summing, over the entries of billText, the log of the
  smoothed relative frequency of the lower-cased word among that outcome's
  word counts. The outcome with the highest score wins; ties go to the
  earlier outcome in the order Nay, Yea, Not Voting.

  Parameters:
    legislator: dict mapping an outcome name to a dict of word counts, whose
      "total_wc !@#" entry holds the total word count for that outcome.
    billText: list of (word, count) tuples. The count is not used for
      weighting; every entry contributes once.
    vote_count: three-element list [nay, yea, not voting]; the slot of the
      predicted label is incremented in place.

  Returns:
    0 for Nay, 1 for Yea, 2 for Not Voting.
  """
  # Laplacian Prior
  k = 3
  unique_words = len(set([word for (word, _) in billText]))

  scores = []
  for outcome in ("Nay", "Yea", "Not Voting"):
    if outcome not in legislator:
      # An outcome without any training data must never win. Leaving its
      # score at 0.0 would make it the maximum, because every real score is
      # a sum of logs of values <= 1 and therefore <= 0.
      scores.append(float("-inf"))
      continue
    word_counts = legislator[outcome]
    denominator = float(word_counts.get("total_wc !@#", 0) + k * unique_words)
    score = 0.
    for (word, _) in billText:
      score += log((word_counts.get(word.lower(), 0) + k) / denominator)
    scores.append(score)

  #Choose the highest probability label (index() returns the first maximum)
  label = scores.index(max(scores))
  vote_count[label] += 1
  return label

#The TF-IDF algorithm with hyperparameters
def tfidf(billText, idf, c):
  """Keep the (at most) c highest scoring words of a tokenized bill text.

  Words are lower-cased and counted. Each distinct word is scored as
  (count / number of tokens) * (log(idf["total_wc !@#"]) / idf.get(word, 1)).

  Parameters:
    billText: list of word tokens.
    idf: dict holding "total_wc !@#" plus a per-word value used as divisor
      (1 for words that are missing).
    c: maximum number of words to keep; values <= 0 select nothing.

  Returns:
    A list of (word, count) tuples for the selected words, in heap order
    (not sorted by score).
  """
  word_count = {}
  for word in billText:
    word = word.lower()
    word_count[word] = word_count.get(word, 0) + 1

  # Nothing to select. Without this guard a non-positive c would index into
  # the still empty heap below and raise IndexError.
  if c <= 0 or not word_count:
    return []

  length = float(len(billText))
  # identical for every word, so evaluate the logarithm once
  log_total = log(idf["total_wc !@#"])

  #Obtain the most important words using TF-IDF with min-heap
  heap = []
  for word in word_count:
    tfidf_val = (word_count[word] / length) * (log_total / idf.get(word, 1))
    if len(heap) < c:
      heappush(heap, (tfidf_val, word))
    elif heap[0][0] < tfidf_val:
      # the new word beats the weakest kept word: swap them
      heappushpop(heap, (tfidf_val, word))
  return [(word, word_count[word]) for (_, word) in heap]

def dprint(explanation,msg):
  """Print 'explanation: msg' while the module-level DEBUG flag equals 1.

  explanation must be a string; msg is converted with str().
  """
  if DEBUG != 1:
    return
  print(": ".join([explanation, str(msg)]))

# Run the validation only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
  main()