Merge pull request #1 from opiethehokie/2016

2016 updates
opiethehokie · Aug 17, 2016 · aee6637 · aee6637
2 parents 5fa92c6 + 8d6daed
commit aee6637
Show file tree

Hide file tree

Showing 23 changed files with 77,674 additions and 2,807,991 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+data/train.csv
+data/predict.csv
+data/rankings.csv
+*.pyc
+submissions/submission.csv
+submissions/bracket.txt
+venv/
diff --git a/.pydevproject b/.pydevproject
@@ -1,8 +1,8 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?eclipse-pydev version="1.0"?><pydev_project>
-<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
-<path>/${PROJECT_DIR_NAME}</path>
-</pydev_pathproperty>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
-<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
-</pydev_project>
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?><pydev_project>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/${PROJECT_DIR_NAME}</path>
+</pydev_pathproperty>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
+</pydev_project>
diff --git a/README.md b/README.md
@@ -1,20 +1,28 @@
 # march-madness-predictions
 
+Predict win probabilities for tournament games in Kaggle's March Machine Learning Mania. Generate a bracket based on those probabilities to be used in traditional pools. Learn about machine learning.
+
 ## 2015 Results
 - missed 14 vs. 3 upsets badly
-- missed multiple Michigan St. wins
-- predicted a confidence of .5 for 4 matchups so not sure who I predicted to win
-- correct on 44 of first 62 games (not counting the "ties") correctly, picked Duke to beat Wisconsin in final with confidence=.63
-- did slightly worse than seed-based benchmark in Kaggle competition
+- missed multiple Michigan St. wins and early Villanova loss
+- correct on 44 of first 62 games (not counting the "ties"), picked Duke to beat Wisconsin in final with confidence=.63
+- did slightly worse than [seed-based benchmark][] in Kaggle competition
+
+## 2016 Results
 
 ## TODO
 
-- in-memory DB instead of all the dictionaries of dictionaries
-- different classifiers will work better with different features, probably better to pick one instead of the ensemble
-- try using different feature selection techniques instead of (or in addition to?) PCA
-- make sure new features (and existing?) account for the number of games played in regular season if appropriate
-- in 2015 all the games I had confidence=.5 were won by higher seed, that should probably be tie breaker
-- what do you do when a team loses early but would have been predicted to beat teams it would face in later rounds, or is predicted to win several games in a row with low confidences?
-- checkout http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html for new features
-- checkout CP-coding instead of one-hot encoding
-- http://googleresearch.blogspot.com/2015/08/the-reusable-holdout-preserving.html
+- use virtualenv and a requirements.txt for dependencies
+- add headers to transformation data
+- results vary every run, find out how to make it more stable
+- use sklearn pipeline (implement blending step) to include feature selection in CV
+
+## FUTURE IDEAS
+
+- develop other rankings based on Colley or Massey methods (see http://netprophetblog.blogspot.com/2015/09/massey-example.html and https://www.kaggle.com/c/march-machine-learning-mania-2016/forums/t/19551/converting-spreads-to-win-percentages-and-vice-versa)
+- add more ML techniques
+- visualizations
+- threshold for upsets when creating the bracket should maybe be higher than 50%?
+
+
+[seed-based benchmark]: https://www.kaggle.com/c/march-machine-learning-mania-2016/forums/t/18902/understanding-the-benchmark-submissions
diff --git a/bracket.py b/bracket.py
@@ -0,0 +1,89 @@
+#   Copyright 2016 Michael Peters
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import csv
+import yaml
+import yamlordereddictloader
+
+from collections import OrderedDict
+
+
+def team_id_mapping():
+    """Build a lookup from the second column of data/teams.csv to the first.
+
+    The header row is skipped. Every remaining row must contain exactly two
+    fields: the first is converted to int and stored under the second
+    (presumably team id and team name, respectively).
+    """
+    name_to_id = {}
+    with open('data/teams.csv') as handle:
+        rows = handle.readlines()[1:]
+        for team_id, team_name in csv.reader(rows):
+            name_to_id[team_name] = int(team_id)
+    return name_to_id
+
+def tourney_format():
+    """Load the bracket layout from data/tourney-2016.yml.
+
+    Returns the object yaml.load builds for that document, using
+    yamlordereddictloader so mappings keep the key order of the file.
+    """
+    # yaml format doesn't have an explicit ordering but intentionally keep order from file
+    # NOTE(review): yaml.load with a non-safe Loader can construct arbitrary
+    # objects; confirm which variant this Loader is, and only feed it the
+    # checked-in data file, never untrusted input.
+    # The with-block closes the file handle, which the bare open() call leaked.
+    with open('data/tourney-2016.yml') as ymlfile:
+        return yaml.load(ymlfile, Loader=yamlordereddictloader.Loader)
+
+def prediction_confidences():
+    """Read submissions/submission.csv into a dict of matchup key -> float.
+
+    The header row is skipped. The first five characters of each row's first
+    field are dropped to form the key (presumably a "YYYY_" season prefix),
+    and the second field is parsed as a float.
+    """
+    confidences = {}
+    with open('submissions/submission.csv') as handle:
+        body = handle.readlines()[1:]
+        for game_id, value in csv.reader(body):
+            confidences[game_id[5:]] = float(value)
+    return confidences
+
+# Load every input once at import time. play_game() reads teamid and
+# predictions as module-level globals, and tourney is the starting bracket
+# passed to simulate().
+teamid = team_id_mapping()
+tourney = tourney_format()
+predictions = prediction_confidences()
+
+def pairwise(it):
+    """Yield consecutive, non-overlapping pairs (a, b), (c, d), ... from *it*.
+
+    A trailing element without a partner is silently dropped.
+    """
+    it = iter(it)
+    for first in it:
+        try:
+            second = next(it)
+        except StopIteration:
+            # Odd number of items: stop cleanly. Letting StopIteration escape
+            # a generator is a RuntimeError on Python 3.7+ (PEP 479).
+            return
+        yield first, second
+
+def log(f, msg):
+    """Echo *msg* to stdout and write it, newline-terminated, to file *f*."""
+    # A single-argument parenthesised print gives the same output on
+    # Python 2 and Python 3; the bare print statement is Python-2-only.
+    print(msg)
+    f.write(msg + "\n")
+
+with open('submissions/bracket.txt', 'w') as f:
+
+    def simulate(teams, rnd):
+        """Recursively play out the bracket, logging each round via log().
+
+        teams -- ordered mapping of seed -> team name. In round 0 a value
+                 containing '|' ("A | B") marks a play-in game between A and B.
+        rnd   -- current round number; 0 resolves play-in games only, later
+                 rounds pair up adjacent entries of *teams*.
+
+        Recursion stops once at most one team is left.
+        """
+        if len(teams) <= 1:
+            return
+        log(f, "\nROUND %d:" % rnd)
+        if rnd == 0:
+            # Start from a copy of the full field so that a bracket without
+            # any play-in games still advances every team, and so the
+            # caller's mapping is not mutated while it is being iterated.
+            winners = OrderedDict(teams)
+            for seed, team in teams.items():
+                if '|' in team:
+                    teama, teamb = team.split(' | ')
+                    # Both play-in teams share the same seed slot.
+                    (_, winner) = play_game(teama, seed, teamb, seed)
+                    winners[seed] = winner
+        else:
+            winners = OrderedDict()
+            for seeda, seedb in pairwise(teams.keys()):
+                wseed, winner = play_game(teams[seeda], seeda, teams[seedb], seedb)
+                # The winner keeps its own seed as its key for the next round.
+                winners[wseed] = winner
+        simulate(winners, rnd + 1)
+
+    def play_game(teama, seeda, teamb, seedb):
+        """Pick the winner of one game, log it, and return (seed, team name).
+
+        The prediction is looked up under "<lower id>_<higher id>". A value
+        >= .5 selects the team with the lower id, anything else the team
+        with the higher id (presumably the value is the lower-id team's win
+        probability -- confirm against the submission format).
+
+        Raises KeyError when a team name or the matchup key is unknown.
+        """
+        teama_id = teamid[teama]
+        teamb_id = teamid[teamb]
+        if teama_id < teamb_id:
+            low_id, high_id = teama_id, teamb_id
+            low_team, high_team = teama, teamb
+        else:
+            low_id, high_id = teamb_id, teama_id
+            low_team, high_team = teamb, teama
+        matchup = "%d_%d" % (low_id, high_id)
+        prediction = predictions[matchup]
+        winner = low_team if prediction >= .5 else high_team
+        log(f, "%s %s vs %s %s = %s %f" % (seeda, teama, seedb, teamb, winner, prediction))
+        return (seeda, winner) if winner == teama else (seedb, winner)
+
+    simulate(tourney, 0)
diff --git a/constants.py b/constants.py
@@ -0,0 +1,21 @@
+#   Copyright 2016 Michael Peters
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+# NOTE(review): the code that consumes these constants is not visible from
+# this file, so the per-line notes below are inferred from the names only --
+# confirm against the call sites before relying on them.
+TOURNEY_START_DAY = 136  # presumably the season day number on which the tournament starts
+REAL_SEASON_START_DAY = 25  # presumably games before this day number are treated differently -- TODO confirm
+
+MIN_RPI = .525  # presumably a lower bound on RPI -- TODO confirm how it is applied
+PYTHAGOREAN_EXPECTATION_EXP = 8.5  # presumably the exponent of a Pythagorean-expectation formula
+MAX_SCORE_MARGIN = 20  # presumably a cap on score margin -- TODO confirm units (points?)
+HOME_COURT_ADVANTAGE = 2  # presumably a home-court adjustment -- TODO confirm units