First step is to clean the data.
import pandas as pd
import numpy as np
file_path = r'F:\NHL Game Data\game.csv' # file path
dataset = pd.read_csv(file_path)
dataset.head()
game_id | season | type | date_time_GMT | away_team_id | home_team_id | away_goals | home_goals | outcome | home_rink_side_start | venue | venue_link | venue_time_zone_id | venue_time_zone_offset | venue_time_zone_tz | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016020045 | 20162017 | R | 2016-10-19T00:30:00Z | 4 | 16 | 4 | 7 | home win REG | right | United Center | /api/v1/venues/null | America/Chicago | -5 | CDT |
1 | 2017020812 | 20172018 | R | 2018-02-07T00:00:00Z | 24 | 7 | 4 | 3 | away win OT | left | KeyBank Center | /api/v1/venues/null | America/New_York | -4 | EDT |
2 | 2015020314 | 20152016 | R | 2015-11-24T01:00:00Z | 21 | 52 | 4 | 1 | away win REG | right | MTS Centre | /api/v1/venues/null | America/Winnipeg | -5 | CDT |
3 | 2015020849 | 20152016 | R | 2016-02-17T00:00:00Z | 52 | 12 | 1 | 2 | home win REG | right | PNC Arena | /api/v1/venues/null | America/New_York | -4 | EDT |
4 | 2017020586 | 20172018 | R | 2017-12-30T03:00:00Z | 20 | 24 | 1 | 2 | home win REG | left | Honda Center | /api/v1/venues/null | America/Los_Angeles | -7 | PDT |
dataset['date_time_GMT']
0 2016-10-19T00:30:00Z 1 2018-02-07T00:00:00Z 2 2015-11-24T01:00:00Z 3 2016-02-17T00:00:00Z 4 2017-12-30T03:00:00Z ... 26300 2019-06-07T00:00:00Z 26301 2019-06-10T00:00:00Z 26302 2019-06-10T00:00:00Z 26303 2019-06-13T00:00:00Z 26304 2019-06-13T00:00:00Z Name: date_time_GMT, Length: 26305, dtype: object
dataset['date_time_GMT']= dataset['date_time_GMT'].map(lambda x: str(x)[:-1]) # Removing the z in data_time_GMT column
dataset['date_time_GMT']
0 2016-10-19T00:30:00 1 2018-02-07T00:00:00 2 2015-11-24T01:00:00 3 2016-02-17T00:00:00 4 2017-12-30T03:00:00 ... 26300 2019-06-07T00:00:00 26301 2019-06-10T00:00:00 26302 2019-06-10T00:00:00 26303 2019-06-13T00:00:00 26304 2019-06-13T00:00:00 Name: date_time_GMT, Length: 26305, dtype: object
np.where(pd.isnull(dataset['home_rink_side_start'])) # Empty cells need to be replace by “NA”
(array([ 102, 190, 200, ..., 26187, 26210, 26212], dtype=int64),)
dataset['home_rink_side_start']=dataset['home_rink_side_start'].where(pd.notnull(dataset['home_rink_side_start']),'NA')
np.where(pd.isnull(dataset['home_rink_side_start']))
(array([], dtype=int64),)
dataset['game_id'].duplicated() # cannot have duplicate values for primary key
0 False 1 False 2 False 3 False 4 False ... 26300 True 26301 False 26302 True 26303 False 26304 True Name: game_id, Length: 26305, dtype: bool
dataset=dataset.drop_duplicates(subset=['game_id'])
dataset.to_csv('game_format.csv', index=False) # writing to csv file
Create MySQL database for NHL Game Data
DROP DATABASE IF EXISTS nhlgamedata;
CREATE DATABASE IF NOT EXISTS nhlgamedata;
USE nhlgamedata;
Next step is to create tables in MySQL database
CREATE TABLE game(
game_id INT NOT NULL,
season VARCHAR(8) NOT NULL,
type ENUM('R','P','A') NOT NULL,
date_time_GMT DATETIME NOT NULL,
away_team_id SMALLINT NOT NULL,
home_team_id SMALLINT NOT NULL,
away_goals TINYINT NOT NULL,
home_goals TINYINT NOT NULL,
outcome VARCHAR(20),
home_rink_side_start ENUM('right','left','NA'),
venue VARCHAR(45),
venue_link VARCHAR(100),
venue_time_zone_id VARCHAR(50),
venue_time_zone_offset TINYINT,
venue_time_zone_tz VARCHAR(3),
PRIMARY KEY (game_id));
Next step is to load the csv file data to MySQL database
LOAD DATA INFILE 'csv file path'
INTO TABLE TableName
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 ROWS;
file_path=r'F:\NHL Game Data\player_info.csv'
dataset=pd.read_csv(file_path)
dataset.head()
player_id | firstName | lastName | nationality | birthCity | primaryPosition | birthDate | birthStateProvince | height | height_cm | weight | shootsCatches | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8466148 | Marian | Hossa | SVK | Stará Lubovna | RW | 1979-01-12 00:00:00 | NaN | 6' 1" | 185.42 | 207.0 | L |
1 | 8465058 | Michal | Rozsival | CZE | Vlasim | D | 1978-09-03 01:00:00 | NaN | 6' 1" | 185.42 | 210.0 | R |
2 | 8476906 | Shayne | Gostisbehere | USA | Pembroke Pines | D | 1993-04-20 01:00:00 | FL | 5' 11" | 180.34 | 180.0 | L |
3 | 8466285 | Brian | Campbell | CAN | Strathroy | D | 1979-05-23 01:00:00 | ON | 5' 10" | 177.80 | 192.0 | L |
4 | 8470607 | Brent | Seabrook | CAN | Richmond | D | 1985-04-20 01:00:00 | BC | 6' 3" | 190.50 | 220.0 | R |
dataset[dataset['height'].isna()]
player_id | firstName | lastName | nationality | birthCity | primaryPosition | birthDate | birthStateProvince | height | height_cm | weight | shootsCatches | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1300 | 8479138 | Scott | Foster | NaN | NaN | G | 1982-01-17 00:00:00 | NaN | NaN | NaN | NaN | NaN |
1342 | 8480718 | Ben | Wexler | NaN | NaN | G | 1994-12-23 00:00:00 | NaN | NaN | NaN | NaN | NaN |
3541 | 8480356 | Kyle | Keyser | NaN | Coral Springs | G | 1999-03-08 00:00:00 | FL | NaN | NaN | NaN | L |
dataset=dataset.dropna(subset=['height']) # remove NaN values
dataset[dataset['height'].isna()]
player_id | firstName | lastName | nationality | birthCity | primaryPosition | birthDate | birthStateProvince | height | height_cm | weight | shootsCatches |
---|
dataset.to_csv('player_info_format.csv', index=False)
CREATE TABLE player_info(
player_id INT NOT NULL,
firstName VARCHAR(40),
lastName VARCHAR(40),
nationality VARCHAR(5),
birthCity VARCHAR(30),
primaryPosition ENUM('RW','D','C','G','LW'),
birthDate DATETIME,
birthStateProvince VARCHAR(10),
height VARCHAR(15),
height_cm DECIMAL(6,3),
weight SMALLINT,
shootsCatches VARCHAR(5),
PRIMARY KEY(player_id));
file_path=r'F:\NHL Game Data\team_info.csv'
dataset = pd.read_csv(file_path)
dataset.head()
team_id | franchiseId | shortName | teamName | abbreviation | link | |
---|---|---|---|---|---|---|
0 | 1 | 23 | New Jersey | Devils | NJD | /api/v1/teams/1 |
1 | 4 | 16 | Philadelphia | Flyers | PHI | /api/v1/teams/4 |
2 | 26 | 14 | Los Angeles | Kings | LAK | /api/v1/teams/26 |
3 | 14 | 31 | Tampa Bay | Lightning | TBL | /api/v1/teams/14 |
4 | 6 | 6 | Boston | Bruins | BOS | /api/v1/teams/6 |
CREATE TABLE team_info(
team_id SMALLINT NOT NULL,
franchiseId TINYINT NOT NULL,
shortName VARCHAR(20),
teamName VARCHAR(25),
abbreviation VARCHAR(3),
link VARCHAR(50),
PRIMARY KEY(team_id));
file_path=r'F:\NHL Game Data\game_goalie_stats.csv'
dataset = pd.read_csv(file_path)
dataset.head()
game_id | player_id | team_id | timeOnIce | assists | goals | pim | shots | saves | powerPlaySaves | shortHandedSaves | evenSaves | shortHandedShotsAgainst | evenShotsAgainst | powerPlayShotsAgainst | decision | savePercentage | powerPlaySavePercentage | evenStrengthSavePercentage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016020045 | 8473607 | 4 | 1504 | 0 | 0 | 0 | 16 | 12 | 1 | 0 | 11 | 0 | 13 | 3 | NaN | 75.000000 | 33.333333 | 84.615385 |
1 | 2016020045 | 8473461 | 4 | 2011 | 0 | 0 | 0 | 11 | 9 | 1 | 0 | 8 | 0 | 10 | 1 | L | 81.818182 | 100.000000 | 80.000000 |
2 | 2016020045 | 8470645 | 16 | 3598 | 0 | 0 | 0 | 27 | 23 | 2 | 0 | 21 | 0 | 23 | 4 | W | 85.185185 | 50.000000 | 91.304348 |
3 | 2017020812 | 8468011 | 24 | 3696 | 0 | 0 | 0 | 33 | 30 | 1 | 2 | 27 | 3 | 28 | 2 | W | 90.909091 | 50.000000 | 96.428571 |
4 | 2017020812 | 8475215 | 7 | 3539 | 0 | 0 | 0 | 33 | 29 | 4 | 1 | 24 | 1 | 27 | 5 | L | 87.878788 | 80.000000 | 88.888889 |
np.where(pd.isnull(dataset))
(array([ 0, 17, 39, ..., 56638, 56653, 56655], dtype=int64), array([15, 15, 17, ..., 15, 17, 17], dtype=int64))
pd.isnull(dataset.iloc[:,15])
0 True 1 False 2 False 3 False 4 False ... 56651 False 56652 False 56653 False 56654 False 56655 False Name: decision, Length: 56656, dtype: bool
pd.isnull(dataset.iloc[:,17])
0 False 1 False 2 False 3 False 4 False ... 56651 False 56652 False 56653 True 56654 False 56655 True Name: powerPlaySavePercentage, Length: 56656, dtype: bool
dataset.iloc[:,15]=dataset.iloc[:,15].fillna('NA')
dataset.iloc[:,17]=dataset.iloc[:,17].fillna('NA')
dataset.iloc[39:45]
game_id | player_id | team_id | timeOnIce | assists | goals | pim | shots | saves | powerPlaySaves | shortHandedSaves | evenSaves | shortHandedShotsAgainst | evenShotsAgainst | powerPlayShotsAgainst | decision | savePercentage | powerPlaySavePercentage | evenStrengthSavePercentage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
39 | 2017021124 | 8474152 | 12 | 3600 | 0 | 0 | 0 | 30 | 23 | 0 | 1 | 22 | 1 | 29 | 0 | L | 76.666667 | NA | 75.862069 |
40 | 2015020981 | 8476899 | 5 | 3566 | 0 | 0 | 0 | 18 | 17 | 4 | 0 | 13 | 0 | 14 | 4 | W | 94.444444 | 100 | 92.857143 |
41 | 2015020981 | 8476234 | 1 | 3600 | 0 | 0 | 0 | 30 | 24 | 4 | 3 | 17 | 3 | 22 | 5 | L | 80.000000 | 80 | 77.272727 |
42 | 2017020146 | 8471306 | 2 | 3595 | 0 | 0 | 0 | 33 | 27 | 8 | 2 | 17 | 4 | 21 | 8 | L | 81.818182 | 100 | 80.952381 |
43 | 2017020146 | 8471227 | 30 | 3600 | 0 | 0 | 0 | 33 | 29 | 6 | 0 | 23 | 0 | 27 | 6 | W | 87.878788 | 100 | 85.185185 |
44 | 2016020716 | 8476914 | 29 | 781 | 0 | 0 | 0 | 5 | 5 | 0 | 0 | 5 | 0 | 5 | 0 | NA | 100.000000 | NA | 100.000000 |
dataset.iloc[289:292]
game_id | player_id | team_id | timeOnIce | assists | goals | pim | shots | saves | powerPlaySaves | shortHandedSaves | evenSaves | shortHandedShotsAgainst | evenShotsAgainst | powerPlayShotsAgainst | decision | savePercentage | powerPlaySavePercentage | evenStrengthSavePercentage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
289 | 2016020163 | 8468011 | 23 | 3292 | 0 | 0 | 22 | 42 | 36 | 5 | 2 | 29 | 2 | 34 | 6 | L | 85.714286 | 83.3333 | 85.294118 |
290 | 2016020163 | 8474593 | 23 | 308 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NaN | NA | NaN |
291 | 2016020163 | 8473523 | 10 | 308 | 0 | 0 | 0 | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 3 | NA | 100.000000 | 100 | NaN |
Need to replace NA and NaN with the 0.0 to match with the datatype in Double
dataset[['savePercentage','powerPlaySavePercentage','evenStrengthSavePercentage']] = dataset[['savePercentage','powerPlaySavePercentage','evenStrengthSavePercentage']].replace('NA',0.0,regex=True)
dataset[['savePercentage','evenStrengthSavePercentage']] = dataset[['savePercentage','evenStrengthSavePercentage']].replace(np.nan,0.0,regex=True)
dataset.iloc[39:45]
game_id | player_id | team_id | timeOnIce | assists | goals | pim | shots | saves | powerPlaySaves | shortHandedSaves | evenSaves | shortHandedShotsAgainst | evenShotsAgainst | powerPlayShotsAgainst | decision | savePercentage | powerPlaySavePercentage | evenStrengthSavePercentage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
39 | 2017021124 | 8474152 | 12 | 3600 | 0 | 0 | 0 | 30 | 23 | 0 | 1 | 22 | 1 | 29 | 0 | L | 76.666667 | 0.0 | 75.862069 |
40 | 2015020981 | 8476899 | 5 | 3566 | 0 | 0 | 0 | 18 | 17 | 4 | 0 | 13 | 0 | 14 | 4 | W | 94.444444 | 100.0 | 92.857143 |
41 | 2015020981 | 8476234 | 1 | 3600 | 0 | 0 | 0 | 30 | 24 | 4 | 3 | 17 | 3 | 22 | 5 | L | 80.000000 | 80.0 | 77.272727 |
42 | 2017020146 | 8471306 | 2 | 3595 | 0 | 0 | 0 | 33 | 27 | 8 | 2 | 17 | 4 | 21 | 8 | L | 81.818182 | 100.0 | 80.952381 |
43 | 2017020146 | 8471227 | 30 | 3600 | 0 | 0 | 0 | 33 | 29 | 6 | 0 | 23 | 0 | 27 | 6 | W | 87.878788 | 100.0 | 85.185185 |
44 | 2016020716 | 8476914 | 29 | 781 | 0 | 0 | 0 | 5 | 5 | 0 | 0 | 5 | 0 | 5 | 0 | NA | 100.000000 | 0.0 | 100.000000 |
np.where(pd.isnull(dataset['savePercentage']))
(array([], dtype=int64),)
Team_id in game_goalie_stats.csv also should be in team_info.csv as per foreign key constraint.
dataset['team_id'].unique()
array([ 4, 16, 24, 7, 21, 52, 12, 20, 15, 8, 10, 26, 23, 29, 22, 5, 25, 2, 53, 1, 14, 30, 28, 6, 18, 9, 13, 3, 17, 19, 54, 27, 11, 88, 87, 90, 89], dtype=int64)
file_path1=r'F:\NHL Game Data\team_info.csv'
df=pd.read_csv(file_path1)
df['team_id']
0 1 1 4 2 26 3 14 4 6 5 3 6 5 7 17 8 28 9 18 10 23 11 16 12 9 13 8 14 30 15 15 16 19 17 24 18 27 19 2 20 10 21 13 22 7 23 20 24 21 25 25 26 29 27 52 28 22 29 54 30 12 31 53 32 11 Name: team_id, dtype: int64
dataframe=dataset.merge(df,on=['team_id'])
not_in_both=dataset[~dataset.team_id.isin(dataframe.team_id)]
not_in_both
game_id | player_id | team_id | timeOnIce | assists | goals | pim | shots | saves | powerPlaySaves | shortHandedSaves | evenSaves | shortHandedShotsAgainst | evenShotsAgainst | powerPlayShotsAgainst | decision | savePercentage | powerPlaySavePercentage | evenStrengthSavePercentage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48937 | 2019040651 | 8477465 | 88 | 570 | 0 | 0 | 0 | 11 | 8 | 0 | 0 | 8 | 0 | 11 | 0 | NA | 72.727273 | 0.0 | 72.727273 |
48938 | 2019040651 | 8474651 | 88 | 600 | 0 | 0 | 0 | 9 | 5 | 0 | 0 | 5 | 0 | 9 | 0 | NA | 55.555556 | 0.0 | 55.555556 |
48939 | 2019040651 | 8476883 | 87 | 600 | 0 | 0 | 0 | 7 | 6 | 0 | 0 | 6 | 0 | 7 | 0 | NA | 85.714286 | 0.0 | 85.714286 |
48940 | 2019040651 | 8475883 | 87 | 600 | 0 | 0 | 0 | 9 | 5 | 0 | 0 | 5 | 0 | 9 | 0 | NA | 55.555556 | 0.0 | 55.555556 |
48941 | 2019040652 | 8479496 | 90 | 600 | 0 | 0 | 0 | 10 | 9 | 0 | 0 | 9 | 0 | 10 | 0 | NA | 90.000000 | 0.0 | 90.000000 |
48942 | 2019040652 | 8474593 | 90 | 600 | 0 | 0 | 0 | 7 | 3 | 0 | 0 | 3 | 0 | 7 | 0 | NA | 42.857143 | 0.0 | 42.857143 |
48943 | 2019040652 | 8476412 | 89 | 600 | 0 | 0 | 0 | 12 | 8 | 0 | 0 | 8 | 0 | 12 | 0 | NA | 66.666667 | 0.0 | 66.666667 |
48944 | 2019040652 | 8476945 | 89 | 600 | 0 | 0 | 0 | 16 | 10 | 0 | 0 | 10 | 0 | 16 | 0 | NA | 62.500000 | 0.0 | 62.500000 |
48945 | 2019040653 | 8476883 | 87 | 559 | 0 | 0 | 0 | 11 | 7 | 0 | 0 | 7 | 0 | 11 | 0 | NA | 63.636364 | 0.0 | 63.636364 |
48946 | 2019040653 | 8475883 | 87 | 600 | 1 | 0 | 0 | 9 | 8 | 0 | 0 | 8 | 0 | 9 | 0 | NA | 88.888889 | 0.0 | 88.888889 |
48947 | 2019040653 | 8479496 | 90 | 600 | 0 | 0 | 0 | 4 | 3 | 0 | 0 | 3 | 0 | 4 | 0 | NA | 75.000000 | 0.0 | 75.000000 |
48948 | 2019040653 | 8474593 | 90 | 600 | 0 | 0 | 0 | 7 | 4 | 0 | 0 | 4 | 0 | 7 | 0 | NA | 57.142857 | 0.0 | 57.142857 |
48949 | 2019040651 | 8477465 | 88 | 570 | 0 | 0 | 0 | 11 | 8 | 0 | 0 | 8 | 0 | 11 | 0 | NA | 72.727273 | 0.0 | 72.727273 |
48950 | 2019040651 | 8474651 | 88 | 600 | 0 | 0 | 0 | 9 | 5 | 0 | 0 | 5 | 0 | 9 | 0 | NA | 55.555556 | 0.0 | 55.555556 |
48951 | 2019040651 | 8476883 | 87 | 600 | 0 | 0 | 0 | 7 | 6 | 0 | 0 | 6 | 0 | 7 | 0 | NA | 85.714286 | 0.0 | 85.714286 |
48952 | 2019040651 | 8475883 | 87 | 600 | 0 | 0 | 0 | 9 | 5 | 0 | 0 | 5 | 0 | 9 | 0 | NA | 55.555556 | 0.0 | 55.555556 |
48953 | 2019040652 | 8479496 | 90 | 600 | 0 | 0 | 0 | 10 | 9 | 0 | 0 | 9 | 0 | 10 | 0 | NA | 90.000000 | 0.0 | 90.000000 |
48954 | 2019040652 | 8474593 | 90 | 600 | 0 | 0 | 0 | 7 | 3 | 0 | 0 | 3 | 0 | 7 | 0 | NA | 42.857143 | 0.0 | 42.857143 |
48955 | 2019040652 | 8476412 | 89 | 600 | 0 | 0 | 0 | 12 | 8 | 0 | 0 | 8 | 0 | 12 | 0 | NA | 66.666667 | 0.0 | 66.666667 |
48956 | 2019040652 | 8476945 | 89 | 600 | 0 | 0 | 0 | 16 | 10 | 0 | 0 | 10 | 0 | 16 | 0 | NA | 62.500000 | 0.0 | 62.500000 |
48957 | 2019040653 | 8476883 | 87 | 559 | 0 | 0 | 0 | 11 | 7 | 0 | 0 | 7 | 0 | 11 | 0 | NA | 63.636364 | 0.0 | 63.636364 |
48958 | 2019040653 | 8475883 | 87 | 600 | 1 | 0 | 0 | 9 | 8 | 0 | 0 | 8 | 0 | 9 | 0 | NA | 88.888889 | 0.0 | 88.888889 |
48959 | 2019040653 | 8479496 | 90 | 600 | 0 | 0 | 0 | 4 | 3 | 0 | 0 | 3 | 0 | 4 | 0 | NA | 75.000000 | 0.0 | 75.000000 |
48960 | 2019040653 | 8474593 | 90 | 600 | 0 | 0 | 0 | 7 | 4 | 0 | 0 | 4 | 0 | 7 | 0 | NA | 57.142857 | 0.0 | 57.142857 |
54148 | 2018040642 | 8474651 | 88 | 600 | 0 | 0 | 0 | 13 | 11 | 0 | 0 | 11 | 0 | 13 | 0 | NA | 84.615385 | 0.0 | 84.615385 |
54149 | 2018040642 | 8468685 | 88 | 600 | 0 | 0 | 0 | 7 | 5 | 0 | 0 | 5 | 0 | 7 | 0 | NA | 71.428571 | 0.0 | 71.428571 |
54150 | 2018040642 | 8476883 | 87 | 600 | 0 | 0 | 0 | 13 | 10 | 0 | 0 | 10 | 0 | 13 | 0 | NA | 76.923077 | 0.0 | 76.923077 |
54151 | 2018040642 | 8470657 | 87 | 600 | 0 | 0 | 0 | 13 | 9 | 0 | 0 | 9 | 0 | 13 | 0 | NA | 69.230769 | 0.0 | 69.230769 |
54152 | 2018040643 | 8474651 | 88 | 600 | 0 | 0 | 0 | 17 | 12 | 0 | 0 | 12 | 0 | 17 | 0 | NA | 70.588235 | 0.0 | 70.588235 |
54153 | 2018040643 | 8468685 | 88 | 600 | 0 | 0 | 0 | 6 | 6 | 0 | 0 | 6 | 0 | 6 | 0 | NA | 100.000000 | 0.0 | 100.000000 |
54154 | 2018040643 | 8471469 | 89 | 583 | 0 | 0 | 0 | 10 | 6 | 0 | 0 | 6 | 0 | 10 | 0 | NA | 60.000000 | 0.0 | 60.000000 |
54155 | 2018040643 | 8471227 | 89 | 600 | 0 | 0 | 0 | 11 | 6 | 0 | 0 | 6 | 0 | 11 | 0 | NA | 54.545455 | 0.0 | 54.545455 |
54156 | 2018040642 | 8474651 | 88 | 600 | 0 | 0 | 0 | 13 | 11 | 0 | 0 | 11 | 0 | 13 | 0 | NA | 84.615385 | 0.0 | 84.615385 |
54157 | 2018040642 | 8468685 | 88 | 600 | 0 | 0 | 0 | 7 | 5 | 0 | 0 | 5 | 0 | 7 | 0 | NA | 71.428571 | 0.0 | 71.428571 |
54158 | 2018040642 | 8476883 | 87 | 600 | 0 | 0 | 0 | 13 | 10 | 0 | 0 | 10 | 0 | 13 | 0 | NA | 76.923077 | 0.0 | 76.923077 |
54159 | 2018040642 | 8470657 | 87 | 600 | 0 | 0 | 0 | 13 | 9 | 0 | 0 | 9 | 0 | 13 | 0 | NA | 69.230769 | 0.0 | 69.230769 |
54160 | 2018040643 | 8474651 | 88 | 600 | 0 | 0 | 0 | 17 | 12 | 0 | 0 | 12 | 0 | 17 | 0 | NA | 70.588235 | 0.0 | 70.588235 |
54161 | 2018040643 | 8468685 | 88 | 600 | 0 | 0 | 0 | 6 | 6 | 0 | 0 | 6 | 0 | 6 | 0 | NA | 100.000000 | 0.0 | 100.000000 |
54162 | 2018040643 | 8471469 | 89 | 583 | 0 | 0 | 0 | 10 | 6 | 0 | 0 | 6 | 0 | 10 | 0 | NA | 60.000000 | 0.0 | 60.000000 |
54163 | 2018040643 | 8471227 | 89 | 600 | 0 | 0 | 0 | 11 | 6 | 0 | 0 | 6 | 0 | 11 | 0 | NA | 54.545455 | 0.0 | 54.545455 |
dataset=dataset.drop(dataset.index[not_in_both.index])
dataset['team_id'].unique()
array([ 4, 16, 24, 7, 21, 52, 12, 20, 15, 8, 10, 26, 23, 29, 22, 5, 25, 2, 53, 1, 14, 30, 28, 6, 18, 9, 13, 3, 17, 19, 54, 27, 11], dtype=int64)
player_id in game_goalie_stats.csv also should be in team_info_format.csv as per foreign key constraint.
file_path2=r'C:\ProgramData\MySQL\MySQL Server 8.0\Uploads\player_info_format.csv'
df1=pd.read_csv(file_path2)
dataframe1=dataset.merge(df1,on=['player_id'])
not_in_both1=dataset[~dataset.player_id.isin(dataframe1.player_id)]
not_in_both1
game_id | player_id | team_id | timeOnIce | assists | goals | pim | shots | saves | powerPlaySaves | shortHandedSaves | evenSaves | shortHandedShotsAgainst | evenShotsAgainst | powerPlayShotsAgainst | decision | savePercentage | powerPlaySavePercentage | evenStrengthSavePercentage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2582 | 2017021196 | 8479138 | 16 | 841 | 0 | 0 | 0 | 7 | 7 | 1 | 0 | 6 | 0 | 6 | 1 | NA | 100.0 | 100.0 | 100.0 |
dataset=dataset.drop(dataset.index[not_in_both1.index])
Remove duplicate values base on all three columns
dataset = dataset.drop_duplicates(subset=['game_id','player_id','team_id'])
dataset.to_csv('game_goalie_stats_format.csv', index=False)
CREATE TABLE game_goalie_stats(
game_id INT NOT NULL,
player_id INT NOT NULL,
team_id SMALLINT NOT NULL,
timeOnIce SMALLINT NOT NULL,
assists TINYINT,
goals TINYINT,
pim TINYINT,
shots TINYINT,
saves TINYINT,
powerPlaySaves TINYINT,
shortHandedSaves TINYINT,
evenSaves TINYINT,
shortHandedShotsAgainst TINYINT,
evenShotsAgainst TINYINT,
powerPlayShotsAgainst TINYINT,
decision ENUM('W','L','NA'),
savePercentage DOUBLE,
powerPlaySavePercentage DOUBLE,
evenStrengthSavePercentage DOUBLE,
FOREIGN KEY(game_id) REFERENCES game (game_id) ON DELETE CASCADE,
FOREIGN KEY(player_id) REFERENCES player_info (player_id) ON DELETE CASCADE,
FOREIGN KEY(team_id) REFERENCES team_info (team_id) ON DELETE CASCADE,
PRIMARY KEY(game_id,player_id,team_id));
file_path=r'F:\NHL Game Data\game_plays.csv'
dataset= pd.read_csv(file_path)
dataset.head()
play_id | game_id | team_id_for | team_id_against | event | secondaryType | x | y | period | periodType | periodTime | periodTimeRemaining | dateTime | goals_away | goals_home | description | st_x | st_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016020045_1 | 2016020045 | NaN | NaN | Game Scheduled | NaN | NaN | NaN | 1 | REGULAR | 0 | 1200.0 | 2016-10-18 23:40:58 | 0 | 0 | Game Scheduled | NaN | NaN |
1 | 2016020045_2 | 2016020045 | NaN | NaN | Period Ready | NaN | NaN | NaN | 1 | REGULAR | 0 | 1200.0 | 2016-10-19 01:35:28 | 0 | 0 | Period Ready | NaN | NaN |
2 | 2016020045_3 | 2016020045 | NaN | NaN | Period Start | NaN | NaN | NaN | 1 | REGULAR | 0 | 1200.0 | 2016-10-19 01:40:50 | 0 | 0 | Period Start | NaN | NaN |
3 | 2016020045_4 | 2016020045 | 16.0 | 4.0 | Faceoff | NaN | 0.0 | 0.0 | 1 | REGULAR | 0 | 1200.0 | 2016-10-19 01:40:50 | 0 | 0 | Jonathan Toews faceoff won against Claude Giroux | 0.0 | 0.0 |
4 | 2016020045_5 | 2016020045 | 16.0 | 4.0 | Shot | Wrist Shot | -71.0 | 9.0 | 1 | REGULAR | 54 | 1146.0 | 2016-10-19 01:41:44 | 0 | 0 | Artem Anisimov Wrist Shot saved by Michal Neuv... | 71.0 | -9.0 |
dataset.iloc[2863335:2863341]
play_id | game_id | team_id_for | team_id_against | event | secondaryType | x | y | period | periodType | periodTime | periodTimeRemaining | dateTime | goals_away | goals_home | description | st_x | st_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2863335 | 2010020061_81 | 2010020061 | NaN | NaN | Period Official | NaN | NaN | NaN | 1 | REGULAR | 1200 | 0.0 | 2010-10-17 01:41:47 | 1 | 0 | Period Official | NaN | NaN |
2863336 | 2010020061_82 | 2010020061 | NaN | NaN | Period Ready | NaN | NaN | NaN | 2 | REGULAR | 0 | 1200.0 | 2010-10-17 01:59:03 | 1 | 0 | Period Ready | NaN | NaN |
2863337 | 2010020061_83 | 2010020061 | NaN | NaN | Period Start | NaN | NaN | NaN | 2 | REGULAR | 0 | 1200.0 | 2010-10-17 01:59:15 | 1 | 0 | Period Start | NaN | NaN |
2863338 | 2008020886_16 | 2008020886 | 22.0 | 20.0 | Missed Shot | NaN | NaN | NaN | 5 | SHOOTOUT | 0 | NaN | 2009-02-22 03:00:00 | 2 | 2 | Sam Gagner | NaN | NaN |
2863339 | 2010020061_84 | 2010020061 | 29.0 | 30.0 | Faceoff | NaN | 0.0 | 0.0 | 2 | REGULAR | 0 | 1200.0 | 2010-10-17 01:59:17 | 1 | 0 | Antoine Vermette faceoff won against Mikko Koivu | 0.0 | 0.0 |
2863340 | 2010020061_85 | 2010020061 | 30.0 | 29.0 | Missed Shot | NaN | 45.0 | 27.0 | 2 | REGULAR | 27 | 1173.0 | 2010-10-17 01:59:46 | 1 | 0 | Greg Zanon - Wide of Net | 45.0 | 27.0 |
np.where(pd.isnull(dataset['periodTimeRemaining']))
(array([2863338, 2863380, 2922266, ..., 3131960, 3131961, 3131962], dtype=int64),)
Replace all the NaN values with 0.0
dataset['periodTimeRemaining'] = dataset['periodTimeRemaining'].replace(np.nan,0.0,regex=True)
np.where(pd.isnull(dataset['periodTimeRemaining']))
(array([], dtype=int64),)
remove duplicate values in primary key
df.to_csv('C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/game_plays_format.csv',index=False) # folder path can be provide to create csv
CREATE TABLE game_plays(
play_id VARCHAR(16) NOT NULL,
game_id INT NOT NULL,
team_id_for VARCHAR(10),
team_id_against VARCHAR(10),
event VARCHAR(30),
secondaryType VARCHAR(40),
x VARCHAR(6),
y VARCHAR(6),
period TINYINT,
periodType ENUM('REGULAR','OVERTIME','SHOOTOUT'),
periodTime SMALLINT,
periodTimeRemaining SMALLINT,
date_Time DATETIME,
goals_away TINYINT,
goals_home TINYINT,
description TEXT,
st_x VARCHAR(6),
st_y VARCHAR(6),
FOREIGN KEY (game_id) REFERENCES game (game_id) ON DELETE CASCADE,
PRIMARY KEY(play_id,game_id ));
file_path=r'F:\NHL Game Data\game_teams_stats.csv'
dataset= pd.read_csv(file_path)
dataset.head()
game_id | team_id | HoA | won | settled_in | head_coach | goals | shots | hits | pim | powerPlayOpportunities | powerPlayGoals | faceOffWinPercentage | giveaways | takeaways | blocked | startRinkSide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016020045 | 4 | away | False | REG | Dave Hakstol | 4.0 | 27.0 | 30.0 | 6.0 | 4.0 | 2.0 | 50.9 | 12.0 | 9.0 | 11.0 | left |
1 | 2016020045 | 16 | home | True | REG | Joel Quenneville | 7.0 | 28.0 | 20.0 | 8.0 | 3.0 | 2.0 | 49.1 | 16.0 | 8.0 | 9.0 | left |
2 | 2017020812 | 24 | away | True | OT | Randy Carlyle | 4.0 | 34.0 | 16.0 | 6.0 | 3.0 | 1.0 | 43.8 | 7.0 | 4.0 | 14.0 | right |
3 | 2017020812 | 7 | home | False | OT | Phil Housley | 3.0 | 33.0 | 17.0 | 8.0 | 2.0 | 1.0 | 56.2 | 5.0 | 6.0 | 14.0 | right |
4 | 2015020314 | 21 | away | True | REG | Patrick Roy | 4.0 | 29.0 | 17.0 | 9.0 | 3.0 | 1.0 | 45.7 | 13.0 | 5.0 | 20.0 | left |
dataset.iloc[40904:40910]
game_id | team_id | HoA | won | settled_in | head_coach | goals | shots | hits | pim | powerPlayOpportunities | powerPlayGoals | faceOffWinPercentage | giveaways | takeaways | blocked | startRinkSide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
40904 | 2017030185 | 28 | away | False | tbc | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
40905 | 2017030185 | 24 | home | False | tbc | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
40906 | 2017030311 | 15 | away | True | REG | Barry Trotz | 4.0 | 32.0 | 15.0 | 6.0 | 4.0 | 2.0 | 54.0 | 4.0 | 6.0 | 19.0 | right |
40907 | 2017030311 | 14 | home | False | REG | Jon Cooper | 2.0 | 21.0 | 27.0 | 8.0 | 3.0 | 1.0 | 46.0 | 8.0 | 8.0 | 10.0 | right |
40908 | 2017030312 | 15 | away | True | REG | Barry Trotz | 6.0 | 37.0 | 38.0 | 14.0 | 3.0 | 1.0 | 43.8 | 6.0 | 4.0 | 16.0 | right |
40909 | 2017030312 | 14 | home | False | REG | Jon Cooper | 2.0 | 35.0 | 33.0 | 12.0 | 4.0 | 2.0 | 56.2 | 6.0 | 8.0 | 10.0 | right |
np.where(pd.isnull(dataset['goals']))
(array([40904, 40905, 40998, 40999, 41000, 41001, 41770, 41771], dtype=int64),)
dataset=dataset.dropna(thresh=dataset.shape[1]-5, axis=0)
dataset.iloc[40904:40910]
game_id | team_id | HoA | won | settled_in | head_coach | goals | shots | hits | pim | powerPlayOpportunities | powerPlayGoals | faceOffWinPercentage | giveaways | takeaways | blocked | startRinkSide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
41238 | 2013030326 | 16 | away | True | REG | Joel Quenneville | 4.0 | 25.0 | 35.0 | 4.0 | 2.0 | 1.0 | 50.8 | 16.0 | 2.0 | 19.0 | left |
41239 | 2013030326 | 26 | home | False | REG | Darryl Sutter | 3.0 | 29.0 | 48.0 | 4.0 | 2.0 | 1.0 | 49.2 | 17.0 | 2.0 | 11.0 | left |
41240 | 2013030327 | 26 | away | True | OT | Darryl Sutter | 5.0 | 32.0 | 43.0 | 10.0 | 2.0 | 0.0 | 47.4 | 8.0 | 4.0 | 12.0 | left |
41241 | 2013030327 | 16 | home | False | OT | Joel Quenneville | 4.0 | 41.0 | 28.0 | 4.0 | 5.0 | 2.0 | 52.6 | 8.0 | 5.0 | 23.0 | left |
41242 | 2015030221 | 5 | away | False | OT | Mike Sullivan | 3.0 | 45.0 | 29.0 | 10.0 | 2.0 | 0.0 | 54.1 | 5.0 | 7.0 | 27.0 | left |
41243 | 2015030221 | 15 | home | True | OT | Barry Trotz | 4.0 | 35.0 | 43.0 | 6.0 | 4.0 | 0.0 | 45.9 | 8.0 | 5.0 | 16.0 | left |
np.where(pd.isnull(dataset['goals']))
(array([], dtype=int64),)
dataset.iloc[205:210]
game_id | team_id | HoA | won | settled_in | head_coach | goals | shots | hits | pim | powerPlayOpportunities | powerPlayGoals | faceOffWinPercentage | giveaways | takeaways | blocked | startRinkSide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
205 | 2017020562 | 5 | home | True | OT | Mike Sullivan | 4.0 | 37.0 | 36.0 | 6.0 | 4.0 | 2.0 | 59.7 | 8.0 | 7.0 | 16.0 | right |
206 | 2017021081 | 1 | away | True | REG | John Hynes | 8.0 | 28.0 | 34.0 | 10.0 | 2.0 | 1.0 | 51.5 | 9.0 | 2.0 | 26.0 | NaN |
207 | 2017021081 | 54 | home | False | REG | Gerard Gallant | 3.0 | 42.0 | 28.0 | 4.0 | 5.0 | 2.0 | 48.5 | 10.0 | 10.0 | 9.0 | NaN |
208 | 2017020955 | 52 | away | True | REG | Paul Maurice | 5.0 | 27.0 | 17.0 | 8.0 | 2.0 | 1.0 | 57.3 | 8.0 | 10.0 | 21.0 | left |
209 | 2017020955 | 25 | home | False | REG | Ken Hitchcock | 3.0 | 36.0 | 25.0 | 4.0 | 4.0 | 0.0 | 42.7 | 7.0 | 5.0 | 13.0 | left |
dataset['startRinkSide'] = dataset['startRinkSide'].replace(np.nan,'NA',regex=True)
dataset.iloc[205:210]
game_id | team_id | HoA | won | settled_in | head_coach | goals | shots | hits | pim | powerPlayOpportunities | powerPlayGoals | faceOffWinPercentage | giveaways | takeaways | blocked | startRinkSide | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
205 | 2017020562 | 5 | home | True | OT | Mike Sullivan | 4.0 | 37.0 | 36.0 | 6.0 | 4.0 | 2.0 | 59.7 | 8.0 | 7.0 | 16.0 | right |
206 | 2017021081 | 1 | away | True | REG | John Hynes | 8.0 | 28.0 | 34.0 | 10.0 | 2.0 | 1.0 | 51.5 | 9.0 | 2.0 | 26.0 | NA |
207 | 2017021081 | 54 | home | False | REG | Gerard Gallant | 3.0 | 42.0 | 28.0 | 4.0 | 5.0 | 2.0 | 48.5 | 10.0 | 10.0 | 9.0 | NA |
208 | 2017020955 | 52 | away | True | REG | Paul Maurice | 5.0 | 27.0 | 17.0 | 8.0 | 2.0 | 1.0 | 57.3 | 8.0 | 10.0 | 21.0 | left |
209 | 2017020955 | 25 | home | False | REG | Ken Hitchcock | 3.0 | 36.0 | 25.0 | 4.0 | 4.0 | 0.0 | 42.7 | 7.0 | 5.0 | 13.0 | left |
dataset['team_id'].unique()
array([ 4, 16, 24, 7, 21, 52, 12, 20, 15, 8, 10, 26, 23, 29, 22, 5, 25, 2, 53, 1, 14, 30, 28, 6, 18, 9, 13, 3, 17, 19, 54, 11, 27, 88, 87, 90, 89], dtype=int64)
file_path1=r'F:\NHL Game Data\team_info.csv'
df=pd.read_csv(file_path1)
dataframe=dataset.merge(df,on=['team_id'])
not_in_both=dataset[~dataset.team_id.isin(dataframe.team_id)]
not_in_both['team_id'].unique()
array([88, 87, 90, 89], dtype=int64)
dataset=dataset.drop(dataset.index[dataset['team_id']==87])
dataset=dataset.drop(dataset.index[dataset['team_id']==88])
dataset=dataset.drop(dataset.index[dataset['team_id']==89])
dataset=dataset.drop(dataset.index[dataset['team_id']==90])
dataset['team_id'].unique()
array([ 4, 16, 24, 7, 21, 52, 12, 20, 15, 8, 10, 26, 23, 29, 22, 5, 25, 2, 53, 1, 14, 30, 28, 6, 18, 9, 13, 3, 17, 19, 54, 11, 27], dtype=int64)
taset=dataset.drop_duplicates(subset=['game_id','team_id'])
dataset.to_csv('C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/game_teams_stats_format.csv',index=False)
CREATE TABLE game_teams_stats(
game_id INT NOT NULL,
team_id SMALLINT NOT NULL,
HoA ENUM('away','home'),
won ENUM('FALSE','TRUE'),
settled_in ENUM('REG','OT','tbc'),
head_coach VARCHAR(30),
goals TINYINT,
shots TINYINT,
hits VARCHAR(4),
pim SMALLINT,
powerPlayOpportunities TINYINT,
powerPlayGoals TINYINT,
faceOffWinPercentage VARCHAR(4),
giveaways VARCHAR(4),
takeaways VARCHAR(4),
blocked VARCHAR(4),
startRinkSide ENUM('left','right','NA'),
FOREIGN KEY(game_id) REFERENCES game (game_id) ON DELETE CASCADE,
FOREIGN KEY(team_id) REFERENCES team_info (team_id) ON DELETE CASCADE,
PRIMARY KEY(game_id,team_id));
file_path=r'F:\NHL Game Data\game_skater_stats.csv'
dataset=pd.read_csv(file_path)
dataset.head()
game_id | player_id | team_id | timeOnIce | assists | goals | shots | hits | powerPlayGoals | powerPlayAssists | ... | faceoffTaken | takeaways | giveaways | shortHandedGoals | shortHandedAssists | blocked | plusMinus | evenTimeOnIce | shortHandedTimeOnIce | powerPlayTimeOnIce | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016020045 | 8468513 | 4 | 955 | 1 | 0 | 0 | 2.0 | 0 | 0 | ... | 0 | 1.0 | 1.0 | 0 | 0 | 1.0 | 1 | 858 | 97 | 0 |
1 | 2016020045 | 8476906 | 4 | 1396 | 1 | 0 | 4 | 2.0 | 0 | 0 | ... | 0 | 1.0 | 2.0 | 0 | 0 | 2.0 | 0 | 1177 | 0 | 219 |
2 | 2016020045 | 8474668 | 4 | 915 | 0 | 0 | 1 | 1.0 | 0 | 0 | ... | 0 | 2.0 | 0.0 | 0 | 0 | 0.0 | -1 | 805 | 0 | 110 |
3 | 2016020045 | 8473512 | 4 | 1367 | 3 | 0 | 0 | 0.0 | 0 | 2 | ... | 27 | 0.0 | 0.0 | 0 | 0 | 0.0 | -1 | 1083 | 19 | 265 |
4 | 2016020045 | 8471762 | 4 | 676 | 0 | 0 | 3 | 2.0 | 0 | 0 | ... | 0 | 0.0 | 1.0 | 0 | 0 | 0.0 | -1 | 613 | 63 | 0 |
5 rows × 22 columns
dataset['team_id'].unique()
array([ 4, 16, 24, 7, 21, 52, 12, 20, 15, 8, 10, 26, 23, 29, 22, 5, 25, 2, 53, 1, 14, 30, 28, 6, 18, 9, 13, 3, 17, 19, 54, 27, 11, 88, 87, 90, 89], dtype=int64)
file_path1=r'F:\NHL Game Data\team_info.csv'
df=pd.read_csv(file_path1)
dataframe=dataset.merge(df,on=['team_id'])
not_in_both=dataset[~dataset.team_id.isin(dataframe.team_id)]
dataset=dataset.drop(dataset.index[not_in_both.index])
dataset['team_id'].unique()
array([ 4, 16, 24, 7, 21, 52, 12, 20, 15, 8, 10, 26, 23, 29, 22, 5, 25, 2, 53, 1, 14, 30, 28, 6, 18, 9, 13, 3, 17, 19, 54, 27, 11], dtype=int64)
dataset=dataset.drop_duplicates(subset=['game_id','player_id','team_id'])
dataset.to_csv('C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/game_skater_stats_format.csv',index=False)
CREATE TABLE game_skater_stats(
game_id INT NOT NULL,
player_id INT NOT NULL,
team_id SMALLINT NOT NULL,
timeOnIce SMALLINT NOT NULL,
assists TINYINT,
goals TINYINT,
shots TINYINT,
hits VARCHAR(4),
powerPlayGoals TINYINT,
powerPlayAssists TINYINT,
penaltyMinutes TINYINT,
faceOffWins TINYINT,
faceoffTaken TINYINT,
takeaways VARCHAR(4),
giveaways VARCHAR(4),
shortHandedGoals TINYINT,
shortHandedAssists TINYINT,
blocked VARCHAR(4),
plusMinus TINYINT,
evenTimeOnIce SMALLINT,
shortHandedTimeOnIce SMALLINT,
powerPlayTimeOnIce SMALLINT,
FOREIGN KEY(game_id) REFERENCES game (game_id) ON DELETE CASCADE,
FOREIGN KEY(player_id) REFERENCES player_info (player_id) ON DELETE CASCADE,
FOREIGN KEY(team_id) REFERENCES team_info (team_id) ON DELETE CASCADE,
PRIMARY KEY(game_id,player_id,team_id));
file_path=r'F:\NHL Game Data\game_goals.csv'
dataset=pd.read_csv(file_path)
dataset.head()
play_id | strength | gameWinningGoal | emptyNet | |
---|---|---|---|---|
0 | 2016020045_6 | Even | False | False |
1 | 2016020045_97 | Even | False | False |
2 | 2016020045_103 | Power Play | False | False |
3 | 2016020045_140 | Power Play | False | False |
4 | 2016020045_197 | Power Play | False | False |
dataset.iloc[52310:52319]
play_id | strength | gameWinningGoal | emptyNet | |
---|---|---|---|---|
52310 | 2009020690_13 | Even | False | False |
52311 | 2009020690_21 | Even | NaN | NaN |
52312 | 2009020690_22 | Even | NaN | NaN |
52313 | 2009020690_23 | Even | NaN | NaN |
52314 | 2009020690_24 | Even | NaN | NaN |
52315 | 2009020690_26 | Even | NaN | NaN |
52316 | 2009020336_3 | Even | True | False |
52317 | 2009020336_4 | Even | False | False |
52318 | 2009020336_5 | Even | False | False |
dataset[['gameWinningGoal','emptyNet']] = dataset[['gameWinningGoal','emptyNet']].replace(np.nan,'NA',regex=True)
dataset.iloc[52310:52319]
play_id | strength | gameWinningGoal | emptyNet | |
---|---|---|---|---|
52310 | 2009020690_13 | Even | False | False |
52311 | 2009020690_21 | Even | NA | NA |
52312 | 2009020690_22 | Even | NA | NA |
52313 | 2009020690_23 | Even | NA | NA |
52314 | 2009020690_24 | Even | NA | NA |
52315 | 2009020690_26 | Even | NA | NA |
52316 | 2009020336_3 | Even | True | False |
52317 | 2009020336_4 | Even | False | False |
52318 | 2009020336_5 | Even | False | False |
dataset=dataset.drop_duplicates(subset=['play_id'])
dataset.to_csv('C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/game_goals_format.csv',index=False)
CREATE TABLE game_goals(
play_id VARCHAR(16) NOT NULL,
strength ENUM('Even','Power Play','Short Handed'),
gameWinningGoal ENUM('FALSE','TRUE','NA'),
emptyNet ENUM('FALSE','TRUE','NA'),
FOREIGN KEY (play_id) REFERENCES game_plays (play_id) ON DELETE CASCADE,
PRIMARY KEY(play_id));