This page serves as a dataset reference. It currently shows a header for each dataset, but I'll update it with notes as I uncover important aspects of the data.

Original source: https://www.kaggle.com/c/mens-machine-learning-competition-2019/data

Data Section 1 - The Basics

import pandas as pd
import numpy as np
from tabulate import tabulate
import os
def print_df(df):
    """Print *df* as an HTML table whose headers are the frame's column keys."""
    # Render first, then emit, so the formatting step is easy to inspect.
    rendered = tabulate(df, tablefmt="html", headers="keys")
    print(rendered)
# Print the names of the entries found in the competition data directory.
print(os.listdir('../input/datafiles/'))
['SecondaryTourneyTeams.csv', 'Players_2012.csv', 'GameCities.csv', 'Events_2010.csv', 'Players_2013.csv', 'Players_2017.csv', 'Events_2011.csv', 'Events_2017.csv', 'Players_2014.csv', 'Events_2018.csv', 'NCAATourneySlots.csv', 'MasseyOrdinals.csv', 'TeamConferences.csv', 'NCAATourneyDetailedResults.csv', 'NCAATourneySeedRoundSlots.csv', 'NCAATourneySeeds.csv', 'NCAATourneyCompactResults.csv', 'TeamCoaches.csv', 'Players_2015.csv', 'Seasons.csv', 'Cities.csv', 'Players_2016.csv', 'Events_2014.csv', 'Players_2018.csv', 'Events_2012.csv', 'Events_2013.csv', 'SecondaryTourneyCompactResults.csv', 'Players_2011.csv', 'RegularSeasonCompactResults.csv', 'Events_2016.csv', 'Events_2015.csv', 'Teams.csv', 'Players_2010.csv', 'Conferences.csv', 'ConferenceTourneyGames.csv', 'TeamSpellings.csv', 'RegularSeasonDetailedResults.csv']

RegularSeasonCompactResults

# Read RegularSeasonCompactResults.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
RegularSeasonCompactResults = pd.read_csv('../input/datafiles/RegularSeasonCompactResults.csv')
print("size = ", RegularSeasonCompactResults.shape)
print_df(RegularSeasonCompactResults.head())
size = (156089, 8)
Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
0 1985 20 1228 81 1328 64 N 0
1 1985 25 1106 77 1354 70 H 0
2 1985 25 1112 63 1223 56 H 0
3 1985 25 1165 70 1432 54 H 0
4 1985 25 1192 86 1447 74 H 0

NCAATourneyCompactResults

# Read NCAATourneyCompactResults.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
NCAATourneyCompactResults = pd.read_csv('../input/datafiles/NCAATourneyCompactResults.csv')
print("size = ", NCAATourneyCompactResults.shape)
print_df(NCAATourneyCompactResults.head())
size = (2184, 8)
Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
0 1985 136 1116 63 1234 54 N 0
1 1985 136 1120 59 1345 58 N 0
2 1985 136 1207 68 1250 43 N 0
3 1985 136 1229 58 1425 55 N 0
4 1985 136 1242 49 1325 38 N 0

NCAATourneySeeds

# Read NCAATourneySeeds.csv into a DataFrame, print its (rows, columns)
# shape, and preview the first rows through print_df.
NCAATourneySeeds = pd.read_csv('../input/datafiles/NCAATourneySeeds.csv')
print("size = ", NCAATourneySeeds.shape)
print_df(NCAATourneySeeds.head())
size = (2218, 3)
Season Seed TeamID
0 1985 W01 1207
1 1985 W02 1210
2 1985 W03 1228
3 1985 W04 1260
4 1985 W05 1374

Seasons

# Read Seasons.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
Seasons = pd.read_csv('../input/datafiles/Seasons.csv')
print("size = ", Seasons.shape)
print_df(Seasons.head())
size = (35, 6)
Season DayZero RegionW RegionX RegionY RegionZ
0 1985 10/29/1984 East West Midwest Southeast
1 1986 10/28/1985 East Midwest Southeast West
2 1987 10/27/1986 East Southeast Midwest West
3 1988 11/2/1987 East Midwest Southeast West
4 1989 10/31/1988 East West Midwest Southeast

Teams

# Read Teams.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
Teams = pd.read_csv('../input/datafiles/Teams.csv')
print("size = ", Teams.shape)
print_df(Teams.head())
size = (366, 4)
TeamID TeamName FirstD1Season LastD1Season
0 1101 Abilene Chr 2014 2019
1 1102 Air Force 1985 2019
2 1103 Akron 1985 2019
3 1104 Alabama 1985 2019
4 1105 Alabama A&M 2000 2019

SampleSubmissionStage1

# Read SampleSubmissionStage1.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
# Note: this file is read from '../input/' directly, not '../input/datafiles/'.
SampleSubmissionStage1 = pd.read_csv('../input/SampleSubmissionStage1.csv')
print("size = ", SampleSubmissionStage1.shape)
print_df(SampleSubmissionStage1.head())
size = (11390, 2)
ID Pred
0 2014_1107_1110 0.5
1 2014_1107_1112 0.5
2 2014_1107_1113 0.5
3 2014_1107_1124 0.5
4 2014_1107_1140 0.5

Data Section 2 - Team Box Scores

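Variables (the W prefix marks the winning team; the detailed-results tables below carry a matching L-prefixed column for the losing team, e.g. LFGM):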

  • WFGM - field goals made (by the winning team)
  • WFGA - field goals attempted (by the winning team)
  • WFGM3 - three pointers made (by the winning team)
  • WFGA3 - three pointers attempted (by the winning team)
  • WFTM - free throws made (by the winning team)
  • WFTA - free throws attempted (by the winning team)
  • WOR - offensive rebounds (pulled by the winning team)
  • WDR - defensive rebounds (pulled by the winning team)
  • WAst - assists (by the winning team)
  • WTO - turnovers committed (by the winning team)
  • WStl - steals (accomplished by the winning team)
  • WBlk - blocks (accomplished by the winning team)
  • WPF - personal fouls committed (by the winning team)

RegularSeasonDetailedResults

# Read RegularSeasonDetailedResults.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
RegularSeasonDetailedResults = pd.read_csv('../input/datafiles/RegularSeasonDetailedResults.csv')
print("size = ", RegularSeasonDetailedResults.shape)
print_df(RegularSeasonDetailedResults.head())
size = (82041, 34)
Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3 WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3 LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
0 2003 10 1104 68 1328 62 N 0 27 58 3 14 11 18 14 24 13 23 7 1 22 22 53 2 10 16 22 10 22 8 18 9 2 20
1 2003 10 1272 70 1393 63 N 0 26 62 8 20 10 19 15 28 16 13 4 4 18 24 67 6 24 9 20 20 25 7 12 8 6 16
2 2003 11 1266 73 1437 61 N 0 24 58 8 18 17 29 17 26 15 10 5 2 25 22 73 3 26 14 23 31 22 9 12 2 5 23
3 2003 11 1296 56 1457 50 N 0 18 38 3 9 17 31 6 19 11 12 14 2 18 18 49 6 22 8 15 17 20 9 19 4 3 23
4 2003 11 1400 77 1208 71 N 0 30 61 6 14 11 13 17 22 12 14 4 4 20 24 62 6 16 17 27 21 15 12 10 7 1 14

NCAATourneyDetailedResults

# Read NCAATourneyDetailedResults.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
NCAATourneyDetailedResults = pd.read_csv('../input/datafiles/NCAATourneyDetailedResults.csv')
print("size = ", NCAATourneyDetailedResults.shape)
print_df(NCAATourneyDetailedResults.head())
size = (1048, 34)
Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3 WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3 LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
0 2003 134 1421 92 1411 84 N 1 32 69 11 29 17 26 14 30 17 12 5 3 22 29 67 12 31 14 31 17 28 16 15 5 0 22
1 2003 136 1112 80 1436 51 N 0 31 66 7 23 11 14 11 36 22 16 10 7 8 20 64 4 16 7 7 8 26 12 17 10 3 15
2 2003 136 1113 84 1272 71 N 0 31 59 6 14 16 22 10 27 18 9 7 4 19 25 69 7 28 14 21 20 22 11 12 2 5 18
3 2003 136 1141 79 1166 73 N 0 29 53 3 7 18 25 11 20 15 18 13 1 19 27 60 7 17 12 17 14 17 20 21 6 6 21
4 2003 136 1143 76 1301 74 N 1 27 64 7 20 15 23 18 20 17 13 8 2 14 25 56 9 21 15 20 10 26 16 14 5 8 19

Data Section 3 - Geography

Cities

# Read Cities.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
Cities = pd.read_csv('../input/datafiles/Cities.csv')
print("size = ", Cities.shape)
print_df(Cities.head())
size = (421, 3)
CityID City State
0 4001 Abilene TX
1 4002 Akron OH
2 4003 Albany NY
3 4004 Albuquerque NM
4 4005 Allentown PA

GameCities

# Read GameCities.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
GameCities = pd.read_csv('../input/datafiles/GameCities.csv')
print("size = ", GameCities.shape)
print_df(GameCities.head())
size = (49235, 6)
Season DayNum WTeamID LTeamID CRType CityID
0 2010 7 1143 1293 Regular 4027
1 2010 7 1314 1198 Regular 4061
2 2010 7 1326 1108 Regular 4080
3 2010 7 1393 1107 Regular 4340
4 2010 9 1143 1178 Regular 4027

Data Section 4 - Public Rankings

MasseyOrdinals

# Read MasseyOrdinals.csv into a DataFrame, print its (rows, columns)
# shape, and preview the first rows through print_df.
MasseyOrdinals = pd.read_csv('../input/datafiles/MasseyOrdinals.csv')
print("size = ", MasseyOrdinals.shape)
print_df(MasseyOrdinals.head())
size = (3492320, 5)
Season RankingDayNum SystemName TeamID OrdinalRank
0 2003 35 SEL 1102 159
1 2003 35 SEL 1103 229
2 2003 35 SEL 1104 12
3 2003 35 SEL 1105 314
4 2003 35 SEL 1106 260

Data Section 5 - Play by Play

Events_2015

# Read Events_2015.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
Events_2015 = pd.read_csv('../input/datafiles/Events_2015.csv')
print("size = ", Events_2015.shape)
print_df(Events_2015.head())
size = (2548633, 11)
EventID Season DayNum WTeamID LTeamID WPoints LPoints ElapsedSeconds EventTeamID EventPlayerID EventType
0 12703749 2015 11 1103 1420 0 0 19 1103 626476 miss3_jump
1 12703750 2015 11 1103 1420 0 0 19 1420 631228 reb_def
2 12703751 2015 11 1103 1420 0 0 27 1420 631233 assist
3 12703752 2015 11 1103 1420 0 2 27 1420 631230 made2_dunk
4 12703753 2015 11 1103 1420 2 2 59 1103 626468 made2_jump

Players_2015

# Read Players_2015.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
Players_2015 = pd.read_csv('../input/datafiles/Players_2015.csv')
print("size = ", Players_2015.shape)
print_df(Players_2015.head())
size = (5442, 4)
PlayerID Season TeamID PlayerName
0 626432 2015 1101 ALBRIGHT_CHRISTIAN
1 626433 2015 1101 COOKE_AUSTIN
2 626434 2015 1101 GRANT_MICHAEL
3 626435 2015 1101 GREEN_DRAKE
4 626436 2015 1101 HANSON_DAVID

Data Section 6 - Supplements

ConferenceTourneyGames

# Read ConferenceTourneyGames.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
# The variable name must be a single identifier: the previous spelling with
# spaces ("Conference Tourney Games") is a SyntaxError, so this cell never ran.
ConferenceTourneyGames = pd.read_csv('../input/datafiles/ConferenceTourneyGames.csv')
print("size = ", ConferenceTourneyGames.shape)
print_df(ConferenceTourneyGames.head())
Traceback (most recent call last): File "", line 1, in File "/tmp/babel-7PnAYV/python-KdmXXV", line 1 Conference Tourney Games = pd.read_csv('../input/datafiles/ConferenceTourneyGames.csv') ^ SyntaxError: invalid syntax

TeamCoaches

# Read TeamCoaches.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
TeamCoaches = pd.read_csv('../input/datafiles/TeamCoaches.csv')
print("size = ", TeamCoaches.shape)
print_df(TeamCoaches.head())
size = (10994, 5)
Season TeamID FirstDayNum LastDayNum CoachName
0 1985 1102 0 154 reggie_minton
1 1985 1103 0 154 bob_huggins
2 1985 1104 0 154 wimp_sanderson
3 1985 1106 0 154 james_oliver
4 1985 1108 0 154 davey_whitney

Conferences

# Read Conferences.csv into a DataFrame, print its (rows, columns) shape,
# and preview the first rows through print_df.
Conferences = pd.read_csv('../input/datafiles/Conferences.csv')
print("size = ", Conferences.shape)
print_df(Conferences.head())
size = (51, 2)
ConfAbbrev Description
0 a_sun Atlantic Sun Conference
1 a_ten Atlantic 10 Conference
2 aac American Athletic Conference
3 acc Atlantic Coast Conference
4 aec America East Conference

TeamConferences

# Read TeamConferences.csv into a DataFrame, print its (rows, columns)
# shape, and preview the first rows through print_df.
TeamConferences = pd.read_csv('../input/datafiles/TeamConferences.csv')
print("size = ", TeamConferences.shape)
print_df(TeamConferences.head())
size = (11241, 3)
Season TeamID ConfAbbrev
0 1985 1114 a_sun
1 1985 1147 a_sun
2 1985 1204 a_sun
3 1985 1209 a_sun
4 1985 1215 a_sun

SecondaryTourneyTeams

# Read SecondaryTourneyTeams.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
SecondaryTourneyTeams = pd.read_csv('../input/datafiles/SecondaryTourneyTeams.csv')
print("size = ", SecondaryTourneyTeams.shape)
print_df(SecondaryTourneyTeams.head())
size = (1568, 3)
Season SecondaryTourney TeamID
0 1985 NIT 1108
1 1985 NIT 1133
2 1985 NIT 1139
3 1985 NIT 1145
4 1985 NIT 1151

SecondaryTourneyCompactResults

# Read SecondaryTourneyCompactResults.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
SecondaryTourneyCompactResults = pd.read_csv('../input/datafiles/SecondaryTourneyCompactResults.csv')
print("size = ", SecondaryTourneyCompactResults.shape)
print_df(SecondaryTourneyCompactResults.head())
size = (1551, 9)
Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT SecondaryTourney
0 1985 136 1151 67 1155 65 H 0 NIT
1 1985 136 1153 77 1245 61 H 0 NIT
2 1985 136 1201 79 1365 76 H 0 NIT
3 1985 136 1231 79 1139 57 H 0 NIT
4 1985 136 1249 78 1222 71 H 0 NIT

NCAATourneySlots

# Read NCAATourneySlots.csv into a DataFrame, print its (rows, columns)
# shape, and preview the first rows through print_df.
NCAATourneySlots = pd.read_csv('../input/datafiles/NCAATourneySlots.csv')
print("size = ", NCAATourneySlots.shape)
print_df(NCAATourneySlots.head())
size = (2184, 4)
Season Slot StrongSeed WeakSeed
0 1985 R1W1 W01 W16
1 1985 R1W2 W02 W15
2 1985 R1W3 W03 W14
3 1985 R1W4 W04 W13
4 1985 R1W5 W05 W12

NCAATourneySeedRoundSlots

# Read NCAATourneySeedRoundSlots.csv into a DataFrame, print its
# (rows, columns) shape, and preview the first rows through print_df.
NCAATourneySeedRoundSlots = pd.read_csv('../input/datafiles/NCAATourneySeedRoundSlots.csv')
print("size = ", NCAATourneySeedRoundSlots.shape)
print_df(NCAATourneySeedRoundSlots.head())
size = (720, 5)
Seed GameRound GameSlot EarlyDayNum LateDayNum
0 W01 1 R1W1 136 137
1 W01 2 R2W1 138 139
2 W01 3 R3W1 143 144
3 W01 4 R4W1 145 146
4 W01 5 R5WX 152 152