-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgen_CommandLine.py
More file actions
205 lines (155 loc) · 8.86 KB
/
Copy pathgen_CommandLine.py
File metadata and controls
205 lines (155 loc) · 8.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 29 12:08:29 2014
@author: Brenton.Mallen
This script is intended to facilitate a grid search for Vowpal Wabbit for the
Kaggle Shopping competition.
This code will have 2 main parts. Part 1: Generate the command line code to run
VW for both quantile and square loss functions - loop through variables. Part
2: determine performance for each iteration.
"""
#-------------------------------------------------------------------------
def gen_DataSplit(Training, TrainLabels, Test_Split_out, Train_Split_out, Train_Split_Label, Test_Split_Label, testSize):
    ''' Split the training data into a test (validation) subset and a training subset.

    The first floor(rows * testSize) rows of ``Training`` are written to the
    test subset and the remaining rows to the training subset. For each subset
    a label file is also written, holding the first character of every row on
    its own line - these are used for the AUC calculation later.

    Training - path of the input training data set
    TrainLabels - accepted for interface compatibility; not used by this function
    Test_Split_out, Train_Split_out - output paths for the two data subsets
    Train_Split_Label, Test_Split_Label - output paths for the two label files
    testSize - fraction of the rows that goes into the test subset

    An empty input file produces four empty output files.
    '''
    from math import floor

    # First pass: count the rows so the split point can be computed.
    with open(Training) as source:
        numRows = sum(1 for _ in source)
    testNum = int(floor(numRows * testSize))

    # Second pass: route every row (and its label) to exactly one subset.
    # Text mode is used so that str rows can be written on Python 2 and 3.
    with open(Training) as source, \
            open(Test_Split_out, 'w') as test_data, \
            open(Test_Split_Label, 'w') as test_labels, \
            open(Train_Split_out, 'w') as train_data, \
            open(Train_Split_Label, 'w') as train_labels:
        for index, row in enumerate(source):
            # Strict '<' so the test subset holds exactly testNum rows.
            if index < testNum:
                data_out, label_out = test_data, test_labels
            else:
                data_out, label_out = train_data, train_labels
            data_out.write(row)
            # The label is taken to be the first character of the row.
            # NOTE(review): multi-character labels such as "-1" would be
            # truncated - confirm the data only uses single-character labels.
            label_out.write(row[0] + '\n')
#-------------------------------------------------------------------------
def gen_CommandLine(TrainingData, TestData, Model_Temp, Preds_out, LossChoice,
                    numPasses, LearnRate, Hessian, RandWeights, Tau):
    '''Build the vowpal wabbit train/test command lines and run them.

    TrainingData, TestData - input file names as strings
    Model_Temp, Preds_out - file names (strings) for the vw model and predictions
    LossChoice - 'squared' or 'quantile'
    numPasses, LearnRate - numerical values passed to --passes and -l
    Tau - numerical value passed to --quantile_tau; ignored for squared loss
          (callers pass [] in that case)
    Hessian, RandWeights - the strings 'on' or 'off'

    Returns Preds_out, the name of the predictions file given to vw via -p.
    Exits the interpreter (sys.exit with a message) if LossChoice, RandWeights
    or Hessian holds an unsupported value. The exit status of the vw commands
    is not checked.
    '''
    import os
    import sys

    # Loss-specific arguments; the loss choice is validated before the switches.
    if LossChoice == 'squared':
        loss_args = ' --loss_function squared'
    elif LossChoice == 'quantile':
        loss_args = ' --loss_function quantile --quantile_tau ' + str(Tau)
    else:
        sys.exit('DANGER Will Robinson!: Invalid argument for Loss Choice')

    # Each switch fragment carries its own leading space and '--' prefix so it
    # can be appended to the command without fusing into the previous token.
    # NOTE(review): current vw documentation lists --random_weights as a plain
    # flag; older builds showed it as taking an argument, where
    # '--random_weights on' may be required - confirm against `vw --help`.
    RW = _vw_switch(RandWeights, '--random_weights', 'Random Weights')
    Hess = _vw_switch(Hessian, '--hessian_on', 'Hessian')

    Command_Train = ('vw ' + TrainingData + ' -c -k --passes ' + str(numPasses)
                     + ' -l ' + str(LearnRate) + ' -f ' + Model_Temp
                     + loss_args + RW + Hess)
    # -t: test only (no learning), -i: load the model just trained.
    Command_Test = 'vw ' + TestData + ' -t -i ' + Model_Temp + ' -p ' + Preds_out

    os.system(Command_Train)  # send command to terminal
    os.system(Command_Test)
    return Preds_out


def _vw_switch(setting, flag, label):
    '''Map an 'on'/'off' setting to a command-line fragment; exit on anything else.'''
    import sys

    if setting == 'on':
        return ' ' + flag
    if setting == 'off':
        return ''
    sys.exit('DANGER Will Robinson!: Invalid argument for ' + label + ' activation')
'''============================================================================
============ Validate Models ==============
============================================================================'''
def perf_metric(InputLabels, Predictions):
    '''Compute the ROC AUC of a predictions file against a labels file.

    InputLabels - path of a text file of true labels, one numeric value per row
    Predictions - path of the predictions file written by vowpal; only its
                  first whitespace-separated column is read as the score

    Returns the value of sklearn.metrics.roc_auc_score(labels, scores).
    '''
    import numpy as np
    from sklearn.metrics import roc_auc_score

    # ndmin=1 keeps a one-row file as a 1-D array instead of a 0-d scalar.
    labels = np.loadtxt(InputLabels, ndmin=1)
    # usecols picks the score column directly, so the file may have a single
    # column (no trailing column after the prediction) or several; indexing
    # the loaded array with [:, 0] would fail on the single-column case.
    scores = np.loadtxt(Predictions, usecols=(0,), ndmin=1)
    return roc_auc_score(labels, scores)
#%%============================================================================
'''============================================================================
============ Run on Data =====================
============================================================================'''
if __name__ == '__main__':
    # Grid search driver: split the data once, then train/test vw for every
    # parameter combination and append one result line per run.
    #
    # numpy is imported here because the functions in this file only import it
    # locally, so the name `np` is otherwise undefined at module scope.
    import itertools

    import numpy as np

    Directory = '/media/sf_Docs/Kaggle/Shopping/Data/'
    TrainingData = Directory + 'train.vw'
    Test_Split_out = Directory + 'train_Test_Split.vw'
    Train_Split_out = Directory + 'train_Train_Split.vw'
    Test_Split_Label = Directory + 'Test_Split_Labels.vw'
    Train_Split_Label = Directory + 'Train_Split_Labels.vw'
    TrainLabels = Directory + 'ReducedTrain_Labels.vw'
    Preds_out = Directory + 'Preds_out.txt'
    Model_Temp = Directory + 'model_temp.vw'
    testSize = 0.6

    LossChoice = ('squared', 'quantile')   # squared or quantile
    numPasses = np.arange(50, 500, 50)     # number of passes. Same for each loss function case
    LearnRate = np.arange(0.35, 1, 0.05)   # learning rate. 0.5 = Default
    Hessian = ('off', 'on')                # On or Off as string
    RandWeights = ('off', 'on')            # On or Off as string
    Tau = np.arange(0.25, 1, 0.05)         # Quantile tau

    gen_DataSplit(TrainingData, TrainLabels, Test_Split_out, Train_Split_out,
                  Train_Split_Label, Test_Split_Label, testSize)

    # Text mode: the result lines are str, which a binary-mode file rejects
    # on Python 3.
    with open(Directory + 'VW_Results.txt', 'w') as outfile:
        # product() iterates in the same order as the equivalent nested loops
        # (rightmost argument varies fastest).
        for lossC, numP, learnR, hess, weights in itertools.product(
                LossChoice, numPasses, LearnRate, Hessian, RandWeights):
            is_quantile = lossC == 'quantile'
            # Tau is only swept for quantile loss; squared loss runs once with
            # the [] placeholder that gen_CommandLine expects in that case.
            tau_values = Tau if is_quantile else [[]]
            for t_ow in tau_values:
                Predict = gen_CommandLine(Train_Split_out, Test_Split_out, Model_Temp, Preds_out,
                                          lossC, numP, learnR, hess, weights, t_ow)
                AUC = round(perf_metric(Test_Split_Label, Predict), 8)
                tau_field = ', Tau: ' + str(t_ow) if is_quantile else ''
                outfile.write('Loss: ' + lossC + tau_field
                              + ', Passes:' + str(numP)
                              + ', LearnRate: ' + str(learnR)
                              + ', Hessian: ' + hess
                              + ', RandWeights: ' + weights
                              + ', AUC: ' + str(AUC) + '\n')