Skip to content

Commit

Permalink
Overhaul logistic regression scripts; fix a number of bugs.
Browse files Browse the repository at this point in the history
  • Loading branch information
rcurtin committed Oct 9, 2017
1 parent 26eda05 commit b71bde3
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 123 deletions.
4 changes: 2 additions & 2 deletions methods/matlab/LOGISTIC_REGRESSION.m
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function logistic_regression(cmd)
X = csvread(regressorsFile{:});

if isempty(responsesFile)
y = X(:,end);
y = X(:,end) + 1; % We have to increment because labels must be positive.
X = X(:,1:end-1);
else
y = csvread(responsesFile{:});
Expand All @@ -47,7 +47,7 @@ function logistic_regression(cmd)
disp(sprintf('[INFO ] total_time: %fs', toc(total_time)))

if ~isempty(testFile)
csvwrite('predictions.csv', idx);
csvwrite('predictions.csv', idx - 1); % Subtract extra label bit.
csvwrite('matlab_lr_probs.csv', predictions);
end

Expand Down
16 changes: 10 additions & 6 deletions methods/matlab/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def RunMetrics(self, options):

# If the dataset contains two files then the second file is the test
# file. In this case we add this to the command line.
if len(self.dataset) == 2:
if len(self.dataset) >= 2:
inputCmd = "-i " + self.dataset[0] + " -t " + self.dataset[1]
else:
inputCmd = "-i " + self.dataset[0]
Expand Down Expand Up @@ -111,11 +111,15 @@ def RunMetrics(self, options):
truelabels = np.genfromtxt(self.dataset[2], delimiter = ',')
metrics['Runtime'] = timer.total_time
confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MultiClass Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['MultiClass Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MultiClass FMeasure'] = Metrics.AvgFMeasure(confusionMatrix)
metrics['MultiClass Lift'] = Metrics.LiftMultiClass(confusionMatrix)
metrics['MultiClass MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['MultiClass Information'] = Metrics.AvgMPIArray(confusionMatrix, truelabels, predictions)
metrics['Simple MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

Expand Down
18 changes: 18 additions & 0 deletions methods/milk/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ def RunLogisticRegressionMilk():
self.model = self.BuildModel()
with totalTimer:
self.model = self.model.train(trainData, labels)
if len(self.dataset) > 1:
# We get back probabilities; cast these to classes.
self.predictions = np.greater(self.model.apply(testData), 0.5)
except Exception as e:
return -1

Expand Down Expand Up @@ -112,4 +115,19 @@ def RunMetrics(self, options):

# Datastructure to store the results.
metrics = {'Runtime' : results}

if len(self.dataset) >= 3:
truelabels = LoadDataset(self.dataset[2])

confusionMatrix = Metrics.ConfusionMatrix(truelabels, self.predictions)

metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MultiClass Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['MultiClass Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MultiClass FMeasure'] = Metrics.AvgFMeasure(confusionMatrix)
metrics['MultiClass Lift'] = Metrics.LiftMultiClass(confusionMatrix)
metrics['MultiClass MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['MultiClass Information'] = Metrics.AvgMPIArray(confusionMatrix, truelabels, self.predictions)
metrics['Simple MSE'] = Metrics.SimpleMeanSquaredError(truelabels, self.predictions)

return metrics
2 changes: 1 addition & 1 deletion methods/mlpack/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def OptionsToStr(self, options):
optionsStr = "-e " + str(options.pop("epsilon"))
if "max_iterations" in options:
optionsStr = optionsStr + " -n " + str(options.pop("max_iterations"))
if "optimizer" in options:
if "algorithm" in options:
optionsStr = optionsStr + " -O " + str(options.pop("optimizer"))
if "step_size" in options:
optionsStr = optionsStr + " -s " + str(options.pop("step_size"))
Expand Down
2 changes: 1 addition & 1 deletion methods/scikit/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def RunLogisticRegressionScikit():
# Use the last row of the training set as the responses.
X, y = SplitTrainData(self.dataset)
if "algorithm" in options:
self.opts["algorithm"] = str(options.pop("algorithm"))
self.opts["solver"] = str(options.pop("algorithm"))
if "epsilon" in options:
self.opts["epsilon"] = float(options.pop("epsilon"))
if "max_iterations" in options:
Expand Down
7 changes: 7 additions & 0 deletions methods/shogun/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.predictions = None
self.z = 1
self.model = None
self.max_iter = None

'''
Build the model for the Logistic Regression.
Expand All @@ -63,6 +64,8 @@ def BuildModel(self, data, responses):
# Create and train the classifier.
model = MulticlassLogisticRegression(self.z, RealFeatures(data.T),
MulticlassLabels(responses))
if self.max_iter is not None:
model.set_max_iter(self.max_iter);
model.train()
return model

Expand All @@ -87,6 +90,10 @@ def RunLogisticRegressionShogun():
# Use the last row of the training set as the responses.
X, y = SplitTrainData(self.dataset)

# Get the maximum number of iterations.
if "max_iterations" in options:
self.max_iter = int(options.pop("max_iterations"))

# Get the regularization value.
if "lambda" in options:
self.z = float(options.pop("lambda"))
Expand Down
22 changes: 15 additions & 7 deletions methods/weka/logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ def __del__(self):
def RunMetrics(self, options):
Log.Info("Perform Logistic Regression.", self.verbose)

maxIterStr = ""
if 'max_iterations' in options:
maxIterStr = " -m " + str(options['max_iterations']) + " "
options.pop('max_iterations')

if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
raise Exception("unknown parameters")
Expand All @@ -79,8 +84,8 @@ def RunMetrics(self, options):

# Split the command using shell-like syntax.
cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
":methods/weka" + " LOGISTICREGRESSION -t " + self.dataset[0] + " -T " +
self.dataset[1])
":methods/weka" + " LogisticRegression -t " + self.dataset[0] + " -T " +
self.dataset[1] + maxIterStr)

# Run command with the necessary arguments and return its output as a byte
# string. We have untrusted input so we disable all shell based features.
Expand All @@ -105,11 +110,14 @@ def RunMetrics(self, options):
truelabels = np.genfromtxt(self.dataset[2], delimiter = ',')
metrics['Runtime'] = timer.total_time
confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
metrics['Avg Accuracy'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MultiClass Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['MultiClass Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MultiClass FMeasure'] = Metrics.AvgFMeasure(confusionMatrix)
metrics['MultiClass Lift'] = Metrics.LiftMultiClass(confusionMatrix)
metrics['MultiClass MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['MultiClass Information'] = Metrics.AvgMPIArray(confusionMatrix, truelabels, predictions)
metrics['Simple MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

Expand Down
102 changes: 0 additions & 102 deletions methods/weka/src/LOGISTICREGRESSION.java

This file was deleted.

17 changes: 15 additions & 2 deletions methods/weka/src/LogisticRegression.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.io.IOException;
import weka.core.*;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.CSVLoader;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.NumericToNominal;

Expand All @@ -29,7 +30,8 @@ public class LogisticRegression {
+ " the last row of the input file.\n\n"
+ "Options:\n\n"
+ "-t [string] Optional file containing containing\n"
+ " test dataset");
+ " test dataset\n"
+ "-m [int] Maximum number of iterations\n");

public static HashMap<Integer, Double> createClassMap(Instances Data) {
HashMap<Integer, Double> classMap = new HashMap<Integer, Double>();
Expand Down Expand Up @@ -69,6 +71,8 @@ public static void main(String args[]) {

// Load input dataset.
DataSource source = new DataSource(regressorsFile);
if (source.getLoader() instanceof CSVLoader)
((CSVLoader) source.getLoader()).setNoHeaderRowPresent(true);
Instances data = source.getDataSet();

// Transform numeric class to nominal class because the
Expand All @@ -81,12 +85,19 @@ public static void main(String args[]) {
nm.setInputFormat(data);
data = Filter.useFilter(data, nm);

boolean hasMaxIters = false;
int maxIter = Integer.parseInt(Utils.getOption('m', args));
if (maxIter != 0)
hasMaxIters = true;

// Did the user pass a test file?
String testFile = Utils.getOption('t', args);
Instances testData = null;
if (testFile.length() != 0)
{
source = new DataSource(testFile);
if (source.getLoader() instanceof CSVLoader)
((CSVLoader) source.getLoader()).setNoHeaderRowPresent(true);
testData = source.getDataSet();

// Weka makes the assumption that the structure of the training and test
Expand Down Expand Up @@ -122,6 +133,8 @@ public static void main(String args[]) {
// Perform Logistic Regression.
timer.StartTimer("total_time");
weka.classifiers.functions.Logistic model = new weka.classifiers.functions.Logistic();
if (hasMaxIters)
model.setMaxIts(maxIter);
model.buildClassifier(data);

// Use the test data to evaluate the model.
Expand All @@ -140,7 +153,7 @@ public static void main(String args[]) {
}
FileWriter writer = new FileWriter(probabs.getName(), false);

File predictions = new File("weka_lr_predictions.csv");
File predictions = new File("weka_predicted.csv");
if(!predictions.exists()) {
predictions.createNewFile();
}
Expand Down
4 changes: 2 additions & 2 deletions util/timer.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ def timeout(fun, timeout=9000):
p.join()

Log.Warn("Script timed out after " + str(timeout) + " seconds")
return -2
return [-2]
else:
try:
r = q.get(timeout=3)
except Exception as e:
r = -1
r = [-1]
return r

0 comments on commit b71bde3

Please sign in to comment.