How To Win The English Premier League: A Final Tutorial¶
Jaehong Kim, Abdullah Alkhaloufalabdalhmid, Andrew Parker
8/20/23
Note: All Images used are Copyright Free Stock Images
Github Repository¶
The Github repositories can be found here:
Jaehong Kim - https://github.com/J-Gim/J-Gim.github.io
Andrew Parker - https://github.com/aparkro/aparkro.github.io/blob/main/FinalTutorial.ipynb
Table of Contents:¶
Links to each step of the data analysis process:
Introduction¶
The English Premier League (EPL), one of the largest leagues in the world, draws millions of fans each year. These fans come to see 20 qualifying clubs compete against each other in a bid to win the Premier League Trophy and the millions of dollars in reward money. However, behind the flashing lights and breathtaking penalties, data analytics plays an important role.
Data analytics gives Premier League clubs strategic insights into their own operations. By understanding the strengths and weaknesses in a club's operations, one can target these characteristics and make the necessary changes to build a stronger club. When millions of dollars are on the line, it is easy to see why data science is so crucial!
On-pitch analytics includes things like optimizing team performance, monitoring player health, and tactical player deployment. These changes can come about by measuring data such as a team's goal-scoring rate, shots on target, players' heart rates, and statistics related to a player's aerobic endurance.
Off the pitch, data analysis helps clubs manage their financial affairs. Sports teams are ultimately businesses at heart, and clubs in the EPL are no different! Statistical data allows clubs to make important financial and managerial decisions such as player transfers, merchandise marketing, and stadium location/licensing.
For broadcasters and advertisers, understanding fan engagement and other viewership trends can allow for more efficient content strategies and marketing campaigns.
These decisions all play a role in ensuring that a club gets maximum value from its investments and remains profitable. Going underwater financially could mean not only relegation to a lower league but also the complete liquidation of the club! To fans and owners alike this is an absolute nightmare, yet without sufficient statistical forecasting it could easily become a reality.
Lastly, it is important to note that the Premier League operates by a point system. The winner of the Premier League for each season is decided by the team that accumulates the most points by the end of the season. Points are awarded to a team as follows:
==> 3 points for a win
==> 1 point for a draw
==> 0 points for a loss
Each team plays 38 matches per season, and the 3 teams with the fewest points (the bottom 3 of the table) are relegated to the Championship (and replaced by 3 promoted Championship teams).
Thus, we can see that while remaining in the Premier League can be lucrative, it is not easy. One must keep a club financially stable and efficient enough to stay above the bottom 3 rankings of the EPL. A club must ultimately understand its strengths and weaknesses in order to improve. To achieve this, proper data analytics (moving through the stages of data collection, processing, hypothesis testing, and interpretation) is crucial.
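The scoring rules above can be sketched as a small helper function; the example record (19 wins, 11 draws, 8 losses) matches Arsenal's 2010/11 season shown later in this tutorial and yields 68 points.

```python
# Points formula used by the Premier League: 3 per win, 1 per draw, 0 per loss
def season_points(wins, draws, losses):
    """Return the total points for a full 38-match season record."""
    assert wins + draws + losses == 38, "a full EPL season is 38 matches"
    return 3 * wins + 1 * draws + 0 * losses

print(season_points(19, 11, 8))  # Arsenal 2010/11: 19W/11D/8L -> 68 points
```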
Further information about the Premier League can be found below:
https://www.premierleague.com/stats ==> The Official Premier League Website with Statistical Data and other information
https://fbref.com/en/comps/9/Premier-League-Stats ==> fbref which holds Premier League Statistics focusing on player stats
https://footystats.org/england/premier-league ==> FootyStats provides prepackaged csv data for different seasons with a wide array of data (provided you have a subscription 😀).
Purpose Of This Tutorial¶
For our purposes, we are interested in on-pitch data. More specifically,
What on-pitch factors appear to have an impact on a team's chance of winning a Premier League season (i.e. accumulating the most points in a given season)?
On-Pitch Factors that will be considered in this tutorial include:
==> Goals Scored
==> Goals Conceded
==> Goal Difference (Scored - Conceded)
==> Total shots
==> Total shots on target
==> Percent Shots Scored (goals / total shots)
==> Fouls Committed
==> Yellow Cards
==> Red Cards
==> Corners
Off-Pitch factors that were not looked at but could be used for a more comprehensive analysis include:
==> Annual Fan Attendance
==> Team Value
==> Stadium Revenue
This tutorial will follow the typical 'Data Science Lifecycle' as listed in the table of contents:
Glossary¶
This glossary will define all column labels in case of ambiguity/confusion:
Team --> Name of club for a given row
Season_End_Year --> The ending year of a season (i.e. for the 2010-2011 season 'Season_End_Year' would be 2011)
Rk --> The Rank/position of the team in the league based on its accumulated points
MP --> The number of matches the team has played during the season.
W --> The number of games the team has won.
D --> The number of games that ended in a tie or draw.
L --> The number of games the team has lost.
GF --> The total number of goals the team has scored.
GA --> The total number of goals the team has conceded or let in.
GD --> Calculated as GF - GA. Indicates the difference between the number of goals scored and the number of goals conceded.
Pts --> The accumulated points for a given team in a given season (a team gets 3 points for a win, 1 point for a draw, and no points for a loss.)
M#GoalsScored --> The number of goals a team has scored in Match # of a given Season
M#GoalsConceded --> The number of goals a team has conceded in Match # of a given Season
M#Shots --> The number of attempts on goal (regardless of whether it resulted in a goal or not) for a given team in Match # of a given Season
M#ShotsOnTarget --> The number of shots that entered the goal or would have entered the goal (if they had not been blocked) for a given team in Match # of a given Season
CornerKicks --> The number of corner kicks for a given team and season
Fouls --> The number of fouls in a season for a given team (includes red and yellow cards as well as minor offences)
YellowCard --> The number of yellow cards in a season for a given team (awarded for "unsportsmanlike" tackles)
RedCard --> The number of red cards in a season for a given team (awarded for "dangerous offences" and assault)
TotalShots --> Total shots taken in the direction of the goal (but not necessarily on track to enter it) for a given team and season
TotalShotsOnTarget --> Total shots taken that were on track to be a goal (includes shots that were either blocked or went in) for a given team and season
PercentShotsScored --> Percent of shots taken that resulted in a goal for a given team and season.
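As a quick illustration of the derived glossary columns GD and PercentShotsScored, here is the arithmetic for Arsenal's 2010/11 numbers that appear later in the tutorial (GF = 72, GA = 43, TotalShots = 595):

```python
# Derived glossary columns, computed from Arsenal's 2010/11 season totals
GF, GA, total_shots = 72, 43, 595
GD = GF - GA                             # goal difference: scored minus conceded
percent_shots_scored = GF / total_shots  # fraction of shots that became goals
print(GD, round(percent_shots_scored, 6))  # 29 0.121008
```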
Imports¶
#defines the import packages that will be used in this tutorial
from google.colab import drive # imports the drive module from Google Colab, allowing us to access and mount Google Drive files within the Colab environment
import matplotlib.pyplot as plt # imports the pyplot module from matplotlib in order to plot various charts
import numpy as np # Imports numpy, a library used for numerical operations and working with arrays.
import pandas as pd # Imports pandas, a powerful data manipulation and analysis library.
import statsmodels.formula.api # provides functions to create statistical models and conduct hypothesis tests using formulas.
import seaborn as sns # Imports seaborn, a statistical data visualization library based on matplotlib. It provides a higher-level interface and attractive visualizations.
import statsmodels.api as sm # Imports statsmodels package for statistical model functions
Data Collection¶
In the context of our study on the English Premier League, data collection involves sourcing and accumulating the desired match statistics and league standings from various seasons. This data will later be used to determine the on-pitch factors that are statistically significant in either increasing or decreasing a team's seasonal points in the EPL.
Specifically, datasets detailing goals, shots, fouls and related metrics for each matchday from the 2010/11 season up to the 2019/20 season have been gathered.
Primary Sources of Data are:
https://www.premierleague.com/
https://data.world/evangower/premier-league-standings-1992-2022 ==> A single csv file detailing Basic Data
https://www.kaggle.com/datasets/taranguyen/english-premier-league-data-for-10-seasons ==> 10 seasonal csv files and one larger csv file detailing Shots, Fouls, Cards, and Corners Data
Further Unused Information about the Premier League for Additional Off-Pitch Statistics:
https://www.kaggle.com/datasets/arbabqaisar/pl-tableattendancekit-sponsorship-data ==> Fan Attendance Data
https://www.transfermarkt.com/premier-league/marktwerteverein/wettbewerb/GB1/plus/?stichtag=2011-05-15 ==> Team Valuation Data
# customizes the display settings for the pandas library.
pd.set_option('display.width', 3500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
# mounts Google Drive to the /content/drive directory of the Colab environment, allowing us to access content at that location.
drive.mount('/content/drive')
#sets the filepaths for the various tables that we will be using
# The main dataset is premier-league-tables.csv
# there are additional datasets for each season from 2010/11 to 2019/20, detailing goals and shots for each matchday.
# the last dataset adds fouls, red card, yellow card, corners
orig = '/content/drive/MyDrive/premier-league-tables.csv' # basic data
# goals and shots for each matchday.
epl1011 = '/content/drive/MyDrive/epldat10seasons/epl1011matchday-goals-shots.csv'
epl1112 = '/content/drive/MyDrive/epldat10seasons/epl1112matchday-goals-shots.csv'
epl1213 = '/content/drive/MyDrive/epldat10seasons/epl1213matchday-goals-shots.csv'
epl1314 = '/content/drive/MyDrive/epldat10seasons/epl1314matchday-goals-shots.csv'
epl1415 = '/content/drive/MyDrive/epldat10seasons/epl1415matchday-goals-shots.csv'
epl1516 = '/content/drive/MyDrive/epldat10seasons/epl1516matchday-goals-shots.csv'
epl1617 = '/content/drive/MyDrive/epldat10seasons/epl1617matchday-goals-shots.csv'
epl1718 = '/content/drive/MyDrive/epldat10seasons/epl1718matchday-goals-shots.csv'
epl1819 = '/content/drive/MyDrive/epldat10seasons/epl1819matchday-goals-shots.csv'
epl1920 = '/content/drive/MyDrive/epldat10seasons/epl1920matchday-goals-shots.csv'
epl10season = '/content/drive/MyDrive/epldat10seasons/epl-allseasons-matchstats.csv' # fouls, red card, yellow card, corners
# The main dataset of basic statistics (orig) is loaded into the pandas df DataFrame.
df = pd.read_csv(orig, sep=',')
# The datasets of match data for each season are loaded into separate DataFrames (df11, df12, ..., df20).
df11 = pd.read_csv(epl1011, sep=',')
df12 = pd.read_csv(epl1112, sep=',')
df13 = pd.read_csv(epl1213, sep=',')
df14 = pd.read_csv(epl1314, sep=',')
df15 = pd.read_csv(epl1415, sep=',')
df16 = pd.read_csv(epl1516, sep=',')
df17 = pd.read_csv(epl1617, sep=',')
df18 = pd.read_csv(epl1718, sep=',')
df19 = pd.read_csv(epl1819, sep=',')
df20 = pd.read_csv(epl1920, sep=',')
# creates a dataframe for the fouls,cards, and corners data
temp_df = pd.read_csv(epl10season, sep=',')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Explanation of Data Collection Code¶
The code above first sets the desired viewing settings for the code outputs.
Next, we use the mount command to access our desired csv files ('premier-league-tables.csv' and 'epl1011matchday-goals-shots.csv' etc.) which are stored in our Google drive at '/content/drive/MyDrive/'.
We then establish the filepaths for our desired csv files.
Next we use the pandas command read_csv() to form dataframes from our desired data from each csv file using a comma ',' as the delimiter.
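As a side note, the ten per-season file paths follow a regular naming pattern, so they could also be built programmatically instead of being typed out individually. The sketch below only constructs the paths (the `base` directory is the one used above); actually reading the files would still require the mounted Drive.

```python
# Build the ten matchday file paths from the season end years 2011-2020.
base = '/content/drive/MyDrive/epldat10seasons'
season_paths = {
    end_year: f'{base}/epl{end_year - 2001}{end_year - 2000}matchday-goals-shots.csv'
    for end_year in range(2011, 2021)
}
# e.g. season_paths[2011] ends with 'epl1011matchday-goals-shots.csv'
# The DataFrames could then be loaded in one pass (requires the mounted Drive):
# dfs = {year: pd.read_csv(path) for year, path in season_paths.items()}
```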
Data Processing¶
Now that our desired data has been collected, the process of data curation begins. Since the data spans multiple sets and consists of numerous metrics, it is important to make sure that there are no gaps/issues between datasets.
We can ensure dataset integrity by verifying completeness (i.e. checking for missing values) and accuracy (i.e. confirming that the datasets line up with one another and contain verifiable data). This involves checking for inconsistencies, missing values, and potential anomalies within each season's dataset.
By successfully curating our data, we can guarantee that our subsequent analyses are grounded in reliable and comprehensive data.
# The main dataset df is filtered to retain only the rows corresponding to seasons of 2010-2011 to 2019-2020.
years_to_keep = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
filtered_df = df[df['Season_End_Year'].isin(years_to_keep)]
# The 'Notes' column is removed as it is not needed
filtered_df = filtered_df.drop(columns=['Notes'])
# View and double check the filtered data
print(filtered_df.head())
print("")
     Season_End_Year             Team  Rk  MP   W   D   L  GF  GA  GD  Pts
366             2011          Arsenal   4  38  19  11   8  72  43   29   68
367             2011      Aston Villa   9  38  12  12  14  48  59  -11   48
368             2011  Birmingham City  18  38   8  15  15  37  58  -21   39
369             2011        Blackburn  15  38  11  10  17  46  59  -13   43
370             2011        Blackpool  19  38  10   9  19  55  78  -23   39
Explanation of Code Above¶
We first filter the basic dataset "df", which holds general statistics such as wins, losses, and overall points for every team and season.
The dataset is filtered to the seasons ending 2011-2020 and stored as "filtered_df".
We then print the head of "filtered_df" to double-check its output.
# For each individual season dataset (df11, df12, etc.), we add a new column indicating the season end year.
df11['Season_End_Year'] = 2011
df12['Season_End_Year'] = 2012
df13['Season_End_Year'] = 2013
df14['Season_End_Year'] = 2014
df15['Season_End_Year'] = 2015
df16['Season_End_Year'] = 2016
df17['Season_End_Year'] = 2017
df18['Season_End_Year'] = 2018
df19['Season_End_Year'] = 2019
df20['Season_End_Year'] = 2020
# Combine all season datasets into a single DataFrame 'combined_df'
# This will make it easier to deal with seasonal match data regarding goals and shots for each team
frames = [df11, df12, df13, df14, df15, df16, df17, df18, df19, df20]
combined_df = pd.concat(frames, ignore_index=True)
# The filtered_df and combined_df have variations of the same column/value types which makes it hard to analyze them later in case of merging or grouping
# For example, combined_df refers to teams as 'Clubs' and filtered_df refers to teams as 'Team'
# In addition, combined_df refers to certain teams with different names such as using 'Birmingham' instead of 'Birmingham City' as used in filtered_df
# Thus, we must standardize the columns and values for each table by replacing alternative/short names with full/shared names to maintain consistency.
combined_df = combined_df.rename(columns={"Club": "Team"})
combined_df['Team'] = combined_df['Team'].replace('Birmingham', 'Birmingham City')
combined_df['Team'] = combined_df['Team'].replace('Man City', 'Manchester City')
combined_df['Team'] = combined_df['Team'].replace('Man Utd', 'Manchester Utd')
combined_df['Team'] = combined_df['Team'].replace('Newcastle', 'Newcastle Utd')
combined_df['Team'] = combined_df['Team'].replace('Stoke', 'Stoke City')
combined_df['Team'] = combined_df['Team'].replace('Wigan', 'Wigan Athletic')
combined_df['Team'] = combined_df['Team'].replace('Norwich', 'Norwich City')
combined_df['Team'] = combined_df['Team'].replace('Swansea', 'Swansea City')
combined_df['Team'] = combined_df['Team'].replace('Cardiff', 'Cardiff City')
combined_df['Team'] = combined_df['Team'].replace('Hull', 'Hull City')
combined_df['Team'] = combined_df['Team'].replace('Leicester', 'Leicester City')
# Rearrange the columns to move the 'Season_End_Year' column right after the 'Team' column.
# This will make it easier to find/read as knowing the corresponding Season of each row's data is important to our later analyses
cols = combined_df.columns.tolist()
cols = [cols[0]] + [cols[-1]] + cols[1:-1]
combined_df = combined_df[cols]
# View and double check the modified 'combined_df'
#print(combined_df.head())
# Print Selected columns of 'combined_df' to avoid glitching when outputting on github
selected_columns = combined_df.iloc[:, :10]
print(selected_columns.head())
              Team  Season_End_Year  M1GoalsScored  M2GoalsScored  M3GoalsScored  M4GoalsScored  M5GoalsScored  M6GoalsScored  M7GoalsScored  M8GoalsScored
0          Arsenal             2011              1              6              2              4              1              2              0              2
1      Aston Villa             2011              3              0              1              1              1              2              1              0
2  Birmingham City             2011              2              2              2              0              1              0              0              1
3        Blackburn             2011              1              1              1              1              1              2              0              0
4        Blackpool             2011              4              0              2              2              0              1              2              2
Explanation of Code Above¶
We first manually add the "Season_End_Year" for each of the 10 separate dataframes ("df11" to "df20") that detail goals and shots for each matchday.
Each of these 10 dataframes is then combined to form an all-encompassing dataframe called "combined_df" that holds goals and shots for each matchday for season end years of 2011-2020.
The column and value names of combined_df are then renamed to ensure that they match filtered_df.
The columns are then reordered for ease of access in combined_df
Finally, a slice of combined_df is printed in order to verify output. Keep in mind that the rest of the columns output is hidden in order to fit within the margins. The columns not shown include M#GoalsConceded, M#Shots, and M#ShotsOnTarget.
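As a side note, the long chain of `.replace` calls above can be collapsed into a single dictionary-based replacement, since `Series.replace` accepts a mapping. Here is a small sketch with a toy DataFrame (the `name_map` entries are the same substitutions used above):

```python
import pandas as pd

# One mapping instead of eleven separate .replace calls
name_map = {
    'Birmingham': 'Birmingham City', 'Man City': 'Manchester City',
    'Man Utd': 'Manchester Utd', 'Newcastle': 'Newcastle Utd',
    'Stoke': 'Stoke City', 'Wigan': 'Wigan Athletic',
    'Norwich': 'Norwich City', 'Swansea': 'Swansea City',
    'Cardiff': 'Cardiff City', 'Hull': 'Hull City',
    'Leicester': 'Leicester City',
}
demo = pd.DataFrame({'Team': ['Man City', 'Arsenal', 'Hull']})
demo['Team'] = demo['Team'].replace(name_map)
print(demo['Team'].tolist())  # ['Manchester City', 'Arsenal', 'Hull City']
```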
# create the last temp_df by calculating the total corner kicks, fouls, yellow cards, and red cards for each team and season group
drop_columns = list(range(1, 3)) + list(range(5, 11)) + list(range(15, 19))
temp_df.drop(temp_df.columns[drop_columns], inplace=True, axis=1)
temp_rows = []
current_season_end_year = None
# Iterate through each row in the df DataFrame
for index, row in temp_df.iterrows():
# Extract season end year
season_end_year = int(row['Season'][-2:])
# Set values to add into the array to create data frame
home_team = row['HomeTeam']
away_team = row['AwayTeam']
home_corner = row['HomeCorners']
away_corner = row['AwayCorners']
home_fouls = row['HomeFouls']
away_fouls = row['AwayFouls']
home_yello = row['HomeYellowCards']
away_yello = row['AwayYellowCards']
home_red = row['HomeRedCards']
away_red = row['AwayRedCards']
# Add values to array to create dataframe
temp_rows.append({'Season_End_Year': season_end_year, 'Team': home_team, 'CornerKicks': home_corner, 'Fouls': home_fouls, 'YellowCard': home_yello, 'RedCard': home_red})
temp_rows.append({'Season_End_Year': season_end_year, 'Team': away_team, 'CornerKicks': away_corner, 'Fouls': away_fouls, 'YellowCard': away_yello, 'RedCard': away_red})
# Build the dataframe from the collected rows
blank_df = pd.DataFrame(temp_rows)
# Group the values by season and team and sum the values
stats_df = blank_df.groupby(['Season_End_Year', 'Team'], as_index=False).sum()
stats_df['Season_End_Year'] = '20' + stats_df['Season_End_Year'].astype(str)
stats_df['Season_End_Year'] = stats_df['Season_End_Year'].astype(int)
# Like combined_df, the raw match data uses short team names (e.g. 'Birmingham' instead of 'Birmingham City' as used in filtered_df)
# Thus, we standardize the team names in stats_df by replacing the short names with the full names used in filtered_df to maintain consistency.
stats_df['Team'] = stats_df['Team'].replace('Birmingham', 'Birmingham City')
stats_df['Team'] = stats_df['Team'].replace('Man City', 'Manchester City')
stats_df['Team'] = stats_df['Team'].replace('Man Utd', 'Manchester Utd')
stats_df['Team'] = stats_df['Team'].replace('Newcastle', 'Newcastle Utd')
stats_df['Team'] = stats_df['Team'].replace('Stoke', 'Stoke City')
stats_df['Team'] = stats_df['Team'].replace('Wigan', 'Wigan Athletic')
stats_df['Team'] = stats_df['Team'].replace('Norwich', 'Norwich City')
stats_df['Team'] = stats_df['Team'].replace('Swansea', 'Swansea City')
stats_df['Team'] = stats_df['Team'].replace('Cardiff', 'Cardiff City')
stats_df['Team'] = stats_df['Team'].replace('Hull', 'Hull City')
stats_df['Team'] = stats_df['Team'].replace('Leicester', 'Leicester City')
print(stats_df.head())
   Season_End_Year             Team  CornerKicks  Fouls  YellowCard  RedCard
0             2011          Arsenal          252    432          68        6
1             2011      Aston Villa          235    437          71        2
2             2011  Birmingham City          152    399          57        3
3             2011        Blackburn          175    455          65        4
4             2011        Blackpool          186    403          47        2
Explanation of Code Above¶
The last dataframe stats_df is created in order to store total corner kicks, fouls, yellow cards, and red cards data from temp_df.
The code first calculates total corner kicks, total fouls, total yellow cards, and total red cards by summing each variable for every match detailed in temp_df.
The calculated values for total corner kicks, total fouls, total yellow cards, and total red cards are stored in stats_df.
The team names in stats_df are then standardized to ensure that they match filtered_df.
We then print the "stats_df" dataset to double-check its output.
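As an aside, the `iterrows()` loop above can also be expressed without explicit iteration: split the match table into home and away halves, rename each half to a shared schema, and concatenate before grouping. A minimal sketch with a tiny made-up match table (the column names follow the epl-allseasons file used above; the numbers are illustrative):

```python
import pandas as pd

# Toy match table with the same home/away column layout as temp_df
matches = pd.DataFrame({
    'Season': ['2010-11', '2010-11'],
    'HomeTeam': ['Arsenal', 'Chelsea'],
    'AwayTeam': ['Chelsea', 'Arsenal'],
    'HomeCorners': [7, 4],
    'AwayCorners': [3, 5],
})
# Rename each half to a shared schema, then stack them into one long table
home = matches[['Season', 'HomeTeam', 'HomeCorners']].rename(
    columns={'HomeTeam': 'Team', 'HomeCorners': 'CornerKicks'})
away = matches[['Season', 'AwayTeam', 'AwayCorners']].rename(
    columns={'AwayTeam': 'Team', 'AwayCorners': 'CornerKicks'})
long_df = pd.concat([home, away], ignore_index=True)
# Sum per season and team, exactly like the groupby above
totals = long_df.groupby(['Season', 'Team'], as_index=False)['CornerKicks'].sum()
print(totals)  # Arsenal: 7 + 5 = 12, Chelsea: 4 + 3 = 7
```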
# Merge combined_df and filtered_df to show the combined statistics for all Teams from Seasons 2010-2011 to 2019-2020
complete_data = pd.merge(combined_df, filtered_df, on=['Season_End_Year', 'Team'])
#move the columns Rk, MP, W, D, L, GF, GA, GD, and Pts to start after 'Season_End_Year'
cols = complete_data.columns.tolist()
move_cols = ['Rk', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts']
new_order = cols[:2] + move_cols + cols[2:-9]
complete_data = complete_data[new_order]
# Merge combined_df and stats_df to show the combined statistics for all Teams from Seasons 2010-2011 to 2019-2020
complete_data = pd.merge(complete_data, stats_df, on=['Season_End_Year', 'Team'])
#move the columns CornerKicks Fouls YellowCard RedCard to start after 'Season_End_Year'
cols = complete_data.columns.tolist()
move_cols = ['CornerKicks', 'Fouls', 'YellowCard', 'RedCard']
new_order = cols[:11] + move_cols + cols[11:-4]
complete_data = complete_data[new_order]
# calculate columns to add: Total shots, Total shots on target, Percent Shots Scored (goals / total shots)
complete_data['TotalShots'] = combined_df.loc[:, 'M1Shots':'M38Shots'].sum(axis=1)
complete_data['TotalShotsOnTarget'] = combined_df.loc[:, 'M1ShotsOnTarget':'M38ShotsOnTarget'].sum(axis=1)
complete_data['PercentShotsScored'] = complete_data['GF'] / complete_data['TotalShots']
# shift the columns again
cols = complete_data.columns.tolist()
move_cols = ['TotalShots', 'TotalShotsOnTarget', 'PercentShotsScored']
new_order = cols[:15] + move_cols + cols[15:-3]
complete_data = complete_data[new_order]
#drop unnecessary columns
complete_data.drop(complete_data.iloc[:, 18:], inplace= True, axis=1)
# View and double check the encompassing 'merged_data'
print(complete_data.head())
# Checks for missing data
print("Total Missing values found in complete_data:")
print(complete_data.isnull().sum().sum())
              Team  Season_End_Year  Rk  MP   W   D   L  GF  GA  GD  Pts  CornerKicks  Fouls  YellowCard  RedCard  TotalShots  TotalShotsOnTarget  PercentShotsScored
0          Arsenal             2011   4  38  19  11   8  72  43   29   68          252    432          68        6         595                 342            0.121008
1      Aston Villa             2011   9  38  12  12  14  48  59  -11   48          235    437          71        2         436                 241            0.110092
2  Birmingham City             2011  18  38   8  15  15  37  58  -21   39          152    399          57        3         327                 155            0.113150
3        Blackburn             2011  15  38  11  10  17  46  59  -13   43          175    455          65        4         385                 191            0.119481
4        Blackpool             2011  19  38  10   9  19  55  78  -23   39          186    403          47        2         446                 235            0.123318

Total Missing values found in complete_data:
0
Explanation of Data Processing Code¶
To Recap: Our previous three code blocks gave us the following dataframes:
- filtered_df which stores basic team info for seasons 2011-2020
- combined_df which stores goals and shots for each matchday in seasons 2011-2020
- stats_df which stores total corner kicks, total fouls, total yellow cards, and total red cards for seasons 2011-2020
We sum the per-match shot columns of combined_df to derive total shots and total shots on target for seasons 2011-2020.
We then merge combined_df and filtered_df into a single table called 'complete_data' on the columns 'Season_End_Year' and 'Team' (which we standardized earlier). We also rearrange some of the columns of 'complete_data' so they are easier to read.
We then merge 'complete_data' with stats_df on the columns 'Season_End_Year' and 'Team' and set it to 'complete_data'.
Thus, we now have 'complete_data' which holds data from filtered_df (storing basic team info), combined_df (storing total goals/shots), and stats_df (storing total corners/fouls/cards) for each Team and Season_End_Year combination from 2011-2020
Thankfully, none of these datasets had any missing data which we checked by running print(complete_data.isnull().sum().sum())
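One extra safeguard worth knowing: `pd.merge` can validate the merge itself. Passing `validate='one_to_one'` asserts that each (Season_End_Year, Team) key is unique on both sides, and `indicator=True` flags rows that failed to match. A small sketch with made-up numbers:

```python
import pandas as pd

# Two toy tables keyed on (Season_End_Year, Team); the values are illustrative
left = pd.DataFrame({'Season_End_Year': [2011, 2011],
                     'Team': ['Arsenal', 'Chelsea'],
                     'Pts': [68, 71]})
right = pd.DataFrame({'Season_End_Year': [2011, 2011],
                      'Team': ['Arsenal', 'Chelsea'],
                      'Fouls': [432, 400]})
merged = pd.merge(left, right, on=['Season_End_Year', 'Team'],
                  how='outer', indicator=True, validate='one_to_one')
# If the datasets line up, every row comes from both tables
assert (merged['_merge'] == 'both').all()
print(merged.drop(columns='_merge'))
```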
Exploratory Analysis & Data Visualization¶
In order to assess Premier League team performance, we perform exploratory analysis, plotting the relationships between provided and derived metrics such as goals, PercentShotsScored (goals / total shots), and fouls against 'Pts' and against each other.
By visualizing these variable relationships, we gain insight into the factors that play a role in helping a team gain or lose points.
Line plots can track the performance trajectory of teams across seasons, while scatter plots uncover relationships between performance metrics, such as the correlation between shot accuracy and points. Annotations and color coding allow for insight into specific variables such as team-wise breakdowns.
Together, these steps help form a cohesive narrative of our data. EDA ultimately transforms raw data into discernible patterns and stories that help us better understand the player and game dynamics of the Premier League.
### Points Across Years
# Keep only rows with valid seasons: drop missing data labeled -1 and restrict to season end years 2011-2020
filtered_df = filtered_df[(filtered_df['Season_End_Year'] != -1) & (filtered_df['Season_End_Year'].between(2011, 2020))]
# Total Points vs Season of each Team
plt.figure(figsize=(9, 6))
# Create a plot with lines for each team
for teamName, team_data in filtered_df.groupby('Team'):
plt.plot(team_data['Season_End_Year'], team_data['Pts'], label=teamName)
# Set plot labels and title
plt.xlabel('Season End Year')
plt.ylabel('Total Points Accumulated')
plt.title('Total Points vs. Season of each Team 2010/2011 to 2019/2020')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol = 2)
plt.grid(True)
plt.tight_layout()
plt.show()
Analysis of 'Total Points vs Season of each Team'¶
The line plot of points accumulated per season for each Premier League team reveals several insights. Teams that consistently perform well maintain high point totals across multiple years, suggesting sustained dominance, while other teams show far more variability and often hover near the bottom of the table. Notably, the graph highlights an exceptional single-season performance in which Manchester City achieved one of the highest point totals on record, and Manchester City more broadly stands out as a model of consistency, remaining competitive at the top of the league year after year. This analysis showcases the importance of sustained performance for maintaining a strong position in the Premier League.
Explanation of 'Total Points vs Season of each Team' Plot Significance¶
The output of our code is a line chart where:
==> The x-axis represents the season end years from 2011 to 2020.
==> The y-axis represents the total points accumulated (where more points indicate more wins)
==> There are multiple lines where each line represents an English Premier League Team's performance in terms of points earned across the ten seasons.
Looking at the output, we can see large variance in different teams' point trajectories over time. Liverpool (top gray line), for example, shows some fluctuation but an overall upward trend, starting with 58 pts in the 2010-2011 season and ending with 99 pts in the 2019-2020 season. Other teams fluctuate heavily around a roughly flat trend, indicating little real growth.
These trends can help us hypothesize about the factors behind a specific team's win rate. For example, Liverpool's trendline shows peaks and troughs: it starts around average (2012), grows steadily to a spike (2014), drops sharply (2016), then climbs again to a higher peak (2020). We can look for potential causes of these swings, such as a change of manager during troughs or larger investment in the squad during peaks.
Lineplots of Season vs. Factors¶
### Lineplots of Season vs. Factors
# Columns to plot
columns_to_plot = ['GF', 'GA', 'GD', 'CornerKicks', 'Fouls', 'YellowCard', 'RedCard', 'TotalShots', 'TotalShotsOnTarget', 'PercentShotsScored']
# Determine number of rows required to plot all columns (3 columns in each row)
num_rows = (len(columns_to_plot) + 2) // 3
# Create a single figure and multiple subplots
fig, axes = plt.subplots(num_rows, 3, figsize=(18, 6 * num_rows))
# Flatten the axes object to iterate easily
axes = axes.ravel()
# Using enumeration loop to plot each column
for idx, (column, ax) in enumerate(zip(columns_to_plot, axes), 1):
# Plotting using seaborn for easy grouped line plots
sns.lineplot(data=complete_data, x="Season_End_Year", y=column, hue="Team", ax=ax)
ax.set_title(f"{idx}. {column} by Season")
ax.set_ylabel(column)
ax.set_xlabel("Season_End_Year")
ax.legend().set_visible(False) # Hide legend for clarity
# Hide any remaining unused subplots (if any)
for ax in axes[len(columns_to_plot):]:
ax.axis('off')
# Adjust layout
plt.tight_layout()
plt.grid(True)
plt.show()
Explanation of the 'Lineplots of Season vs. Factors' Above¶
The plots above visualize the season-level data in 'complete_data'. Rather than following one general trend, they capture the dynamic playing environment of the English Premier League: each team's goals scored, goals conceded, fouls, and shots vary from season to season.
However, there are some notable tendencies: teams that score more goals during a particular season also tend to take a higher number of shots in that season.
Interestingly, some teams that scored more goals during specific seasons also committed fewer fouls in those seasons than teams that scored less.
Lastly, while the total number of shots has remained in a similar range over the seasons, total shots on target dropped sharply between 2012 and 2016. Trends like this require more analysis to deduce their root cause.
Individual Subplot Explanations of the 'Lineplots of Season vs. Factors' Above¶
Goals Scored by Each Team Across Years: This line chart illustrates the goals scored by each team for each season over the span of a decade.
Notably, some teams have maintained consistent high levels of goal scoring across different seasons (top pink line). This could be due to a strong attacking strategy or a consistent pool of talented strikers that have enabled these teams to consistently find the back of the net.
Other teams appear to have a lot more variance in their goal-scoring ability.
Goals Conceded by Each Team Across Years: Likewise, while there is a lot of variance, certain teams appear to be more consistent in the number of goals they concede than others.
This could be due to the fact that bigger teams have shown a degree of defensive consistency. This might be attributed to their strong defensive tactics, solid backline, or goalkeeping prowess.
Smaller teams, on the other hand, might experience more variability due to changes in player composition and tactical approach.
Goal Difference (Scored - Conceded) Across Years: The fluctuating goal difference trend signifies the dynamic nature of the league. However, the consistency observed among top teams like Manchester City and Liverpool suggests attacking prowess and a well-structured defense.
Corner Kicks for Each Season: This chart highlights corner kicks for each team over multiple seasons.
Overall there is a lot of variation, and while some teams maintain a consistently higher number of corner kicks, it is hard to tell whether this reflects a specific characteristic or strategy of the team, or is simply due to chance.
Fouls Per Season for Each Team: The substantial fluctuation in fouls committed per season could suggest evolving playing styles and referee tendencies.
Looking at the multiple lines, there appears to be a common spike in fouls around 2015 and 2018. This could be the result of teams adopting more aggressive strategies, leading to a higher foul rate.
Yellow and Red Cards Across Seasons: While there is a lot of variation, the spike in yellow and red cards from 2014 to 2016 might be due to stricter refereeing or an increased prevalence of aggressive playstyles.
Total Shots by Each Team for Each Season: Again, there is a lot of variation, but the spikes in 2014 suggest a focus on shot-taking. This tendency appears to reverse, however, with shot totals concentrating at lower levels in the later seasons.
Total Shots on Target Across Years: The significant drop in shots on target observed in 2014, followed by a sustained decrease, hints at a shift in teams' approach to shooting accuracy. This could be attributed to changes in tactical emphasis, focusing on shot quality over quantity. The observed trend might indicate a more cautious approach to shooting to ensure better chances of scoring.
Percentage of Shots Scored Across Years: While there is a lot of fluctuation, there appears to be a significant and consistent concentration of the shot percentage around 0.10 to 0.12 from 2011-2020. This makes sense, as consistent shot taking is important in the league; it also hints that many shots are taken but comparatively few go in.
In conclusion, interesting trends and patterns in the Premier League over the analyzed period have been discovered. These trends offer insights into evolving tactical strategies, player behavior, and changes in the overall dynamics of the league. It's important to consider these observations in the context of broader football trends and potential external factors that might influence team performance and gameplay.
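The PercentShotsScored values discussed above are presumably derived as goals scored divided by total shots; the column name and the observed 0.10-0.12 range are consistent with that ratio. A minimal sketch of how such a column could be built (the DataFrame here is synthetic, not the tutorial's `complete_data`):

```python
import pandas as pd

# Synthetic stand-in for a few team-seasons (not the real EPL data)
df = pd.DataFrame({
    "Team": ["A", "B", "C"],
    "GF": [60, 45, 30],
    "TotalShots": [500, 450, 300],
})

# Conversion rate: goals scored per shot taken
df["PercentShotsScored"] = df["GF"] / df["TotalShots"]
print(df)
```

Values in the 0.10-0.12 band, as seen here, mean roughly one goal for every eight to ten shots taken.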
Lineplots of Season_End_Year vs. Mean Factors¶
### Lineplots of Season vs. Mean Factors
# Columns to plot
columns_to_plot = ['GF', 'GA', 'GD', 'CornerKicks', 'Fouls', 'YellowCard', 'RedCard', 'TotalShots', 'TotalShotsOnTarget', 'PercentShotsScored']
# Group data by Season_End_Year and calculate the mean of the numeric columns
grouped_data = complete_data.groupby('Season_End_Year').mean(numeric_only=True).reset_index()
# Determine number of rows required to plot all columns (3 columns in each row)
num_rows = (len(columns_to_plot) + 2) // 3
# Create a single figure and multiple subplots
fig, axes = plt.subplots(num_rows, 3, figsize=(18, 6 * num_rows))
# Flatten the axes object to iterate easily
axes = axes.ravel()
# Using enumeration loop to plot each column
for idx, (column, ax) in enumerate(zip(columns_to_plot, axes), 1):
    # Plotting using seaborn for easy grouped line plots
    sns.lineplot(data=grouped_data, x="Season_End_Year", y=column, ax=ax)
    ax.set_title(f"{idx}. Mean {column} by Season")
    ax.set_ylabel(column)
    ax.set_xlabel("Season_End_Year")
# Hide any remaining unused subplots (if any)
for ax in axes[len(columns_to_plot):]:
    ax.axis('off')
# Adjust layout
plt.tight_layout()
plt.show()
MEAN SEASON FACTORS VS YEARS ANALYSIS¶
Mean Goals Scored Over Time: The fluctuating pattern of mean goals scored, with a significant drop between 2014 and 2016, followed by a subsequent increase between 2016 and 2020, reflects the dynamic nature of goal-scoring trends. Changes in playing styles, tactics, and team compositions might have contributed to these variations.
Mean Goals Conceded Over Time: Mean goals conceded exhibits the same fluctuating pattern as mean goals scored. This is expected: every goal scored by one team is conceded by another, so the league-wide means move together.
Mean Goal Difference (Scored - Conceded) Over Time: Mean goal difference sits at exactly zero in every season, because the league-wide total of goals scored always equals the total of goals conceded.
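This zero-sum property can be checked directly: every goal scored by one club is conceded by another, so summing GD over all clubs in a season must give zero. A quick illustration on a made-up four-team season (not the real EPL data):

```python
import pandas as pd

# Toy season: goals for (GF) and against (GA) for four teams.
# League-wide, total GF must equal total GA, since each goal
# appears once as "for" and once as "against".
season = pd.DataFrame({
    "Team": ["A", "B", "C", "D"],
    "GF": [10, 8, 5, 3],
    "GA": [4, 6, 7, 9],
})
season["GD"] = season["GF"] - season["GA"]
print(season["GD"].sum())  # league-wide goal difference sums to 0
```

Since the sum is always zero, the mean GD across teams is zero in every season, which is why that subplot is a flat line at 0.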
Mean Corner Kicks Against Over Time: The gradual drop in mean corner kicks against over the years suggests a trend of fewer corner kicks occurring as time progresses. This could be due to tactical shifts and strategies focused on minimizing situations that lead to conceding corner kicks.
Mean Fouls Per Season Over Time: The fluctuating pattern with peaks and drops in mean fouls indicates varying levels of physicality and aggression in different seasons. The jump from 2012 to 2018 could suggest tactical adjustments or rule changes that influenced players' behaviors.
Mean Yellow and Red Cards Over Time: The resemblance of mean yellow cards to the mean fouls pattern highlights a potential correlation between fouls committed and disciplinary actions taken by referees. The gradual drop in mean red cards across all years could indicate improved player conduct.
Mean Total Shots Over Time: The decreasing trend in mean total shots suggests a strategic shift in teams' approach to shooting. The emphasis might be moving away from sheer volume and more toward quality, aiming for more accurate shots rather than a higher number of attempts.
Mean Total Shots on Target Over Time: The major drop in mean total shots on target during 2014, followed by a consistent trend, underscores a change in shooting accuracy. Teams seem to have adjusted their tactics, focusing on precision rather than simply attempting to place shots on goal.
Mean Percentage of Shots Scored Over Time: The significant drop between 2014 and 2016 in mean percentage of shots scored, followed by a recovery, might reflect changes in finishing efficiency during that period. The subsequent return to normal suggests a stabilization in teams' ability to convert shots into goals.
These trends could be influenced by factors such as tactical innovations, rule changes, player development, and coaching strategies. Keep in mind that while these trends can help draw conclusions, a comprehensive understanding might require additional contextual information and in-depth investigation.
Scatterplots Pts vs. Factors¶
### Scatterplots Pts vs. Factors
# List of columns to plot
columns_to_plot = ['GF', 'GA', 'GD', 'CornerKicks', 'Fouls', 'YellowCard', 'RedCard', 'TotalShots', 'TotalShotsOnTarget', 'PercentShotsScored']
# Determine the number of rows required to plot all columns (3 columns in each row)
num_rows = (len(columns_to_plot) + 2) // 3
# Create a single figure and multiple subplots
fig, axes = plt.subplots(num_rows, 3, figsize=(18, 6 * num_rows))
# Flatten the axes object to iterate easily
axes = axes.ravel()
# Iterate over each column in columns_to_plot to create scatter plots against 'Pts'
for idx, (column, ax) in enumerate(zip(columns_to_plot, axes), 1):
    # Plotting
    ax.scatter(complete_data[column], complete_data['Pts'], label=column, alpha=0.6)
    # Regression line computation using statsmodels
    X = sm.add_constant(complete_data[column])
    model = sm.OLS(complete_data['Pts'], X).fit()
    ax.plot(complete_data[column], model.predict(X), color='red')  # plot regression line
    ax.text(0.1, 0.9, f'y = {model.params[column]:.2f}x + {model.params["const"]:.2f}', transform=ax.transAxes, color="red")  # display equation
    ax.text(0.1, 0.8, f'p-value: {model.pvalues[column]:.5f}', transform=ax.transAxes, color="blue")  # display p-value
    ax.set_title(f"{idx}. {column} vs Points by Team and Season")
    ax.set_xlabel(f"{column}")
    ax.set_ylabel("Points")
    ax.legend()
    ax.grid(True)
# Hide any remaining unused subplots (if any)
for ax in axes[len(columns_to_plot):]:
    ax.axis('off')
plt.tight_layout()
plt.show()
Season Factors vs. Points Scatterplot Analysis¶
Goals Scored vs Points: The positive correlation observed between goals scored and points indicates that teams which score more goals tend to accumulate more points, reflecting a connection between offensive prowess and overall success. This strong association suggests that goal-scoring ability is a significant factor contributing to a team's performance in the league.
Goals Conceded vs Points: The negative correlation between goals conceded and points highlights that teams which concede fewer goals generally attain higher point totals. This underscores the importance of a strong defense in achieving positive outcomes in the league. Teams with solid defensive records are often better positioned to secure points and higher league standings.
Goal Difference vs Points: The positive slope or correlation between goal difference and points reinforces the idea that teams with a greater positive goal difference, indicating a strong attack and solid defense, tend to achieve higher point totals.
Corner Kicks vs Points: The correlation between corner kicks and points is less clear than that between goals scored and points. While more corner kicks do appear to contribute to more points, the connection is not as pronounced as with goals scored.
Fouls vs Points: The general trend of higher fouls being associated with lower points aligns with the notion that disciplined play and avoiding excessive fouls can contribute to better results. However, the presence of outliers might indicate that some teams achieve success despite committing more fouls, possibly due to other strengths or tactical considerations.
Yellow and Red Cards vs Points: Similar to fouls, the correlation between yellow and red cards and points also suggests that teams with better disciplinary records tend to accumulate more points. This might reflect a more composed and controlled approach to the game, contributing to a team's overall performance.
Total Shots vs Points: The direct correlation observed between total shots and points reinforces the idea that teams with a higher volume of shots tend to secure more points. This could indicate a more offensive-minded strategy, where creating a larger number of scoring opportunities is linked to achieving better results.
Total Shots on Target vs Points: The correlation between total shots on target and points appears to reaffirm the importance of shot accuracy. Teams that consistently place shots on target might be more likely to convert those chances into goals, resulting in higher point totals and better league standings.
Percentage of Shots Scored vs Points: The connection between shot efficiency, as represented by the percentage of shots scored, and points emphasizes the strategic value of converting chances into goals. Teams with a higher percentage of shots scored are more effective in capitalizing on their opportunities, leading to better point accumulation.
These trends can help in understanding the tactical priorities and strategies that contribute to achieving favorable outcomes over the analyzed seasons. Remember that while correlations are observed, they don't always imply causation, and multiple factors could be at play in determining a team's performance.
Here is a link to help understand the scatter plots and correlation: https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/introduction-to-scatterplots/a/scatterplots-and-correlation-review
Scatterplots of Mean Pts For Every Team vs. Mean Factors For Every Team from Seasons 2011-2020¶
### Scatterplots of Mean Pts For Every Team vs. Mean Factors For Every Team from Seasons 2011-2020
# Group by 'Team' and compute the means of the numeric columns
grouped_means = complete_data.groupby(['Team']).mean(numeric_only=True).reset_index()
# List of columns to plot
columns_to_plot = ['GF', 'GA', 'GD', 'CornerKicks', 'Fouls', 'YellowCard', 'RedCard', 'TotalShots', 'TotalShotsOnTarget', 'PercentShotsScored']
# Determine the number of rows required to plot all columns (3 columns in each row)
num_rows = (len(columns_to_plot) + 2) // 3
# Create a single figure and multiple subplots
fig, axes = plt.subplots(num_rows, 3, figsize=(18, 6 * num_rows))
# Flatten the axes object to iterate easily
axes = axes.ravel()
# Iterate over each column in columns_to_plot to create scatter plots against 'Pts'
for idx, (column, ax) in enumerate(zip(columns_to_plot, axes), 1):
    # Plotting
    ax.scatter(grouped_means[column], grouped_means['Pts'], label=column, alpha=0.6)
    # Label each point with its team name
    for i, team in enumerate(grouped_means['Team']):
        ax.annotate(team, (grouped_means[column][i], grouped_means['Pts'][i]), fontsize=8, alpha=0.6, ha='center')
    # Regression line computation using statsmodels
    X = sm.add_constant(grouped_means[column])
    model = sm.OLS(grouped_means['Pts'], X).fit()
    ax.plot(grouped_means[column], model.predict(X), color='red')  # plot regression line
    ax.text(0.1, 0.9, f'y = {model.params[column]:.2f}x + {model.params["const"]:.2f}', transform=ax.transAxes, color="red")  # display equation
    ax.text(0.1, 0.8, f'p-value: {model.pvalues[column]:.5f}', transform=ax.transAxes, color="blue")  # display p-value
    ax.set_title(f"{idx}. Mean {column} vs Mean Points by Team From 2011-2020")
    ax.set_xlabel(f"{column}")
    ax.set_ylabel("Points")
    ax.legend()
    ax.grid(True)
# Hide any remaining unused subplots (if any)
for ax in axes[len(columns_to_plot):]:
    ax.axis('off')
plt.tight_layout()
plt.show()
Analysis Of Mean Pts For Every Team vs. Mean Factors For Every Team from Seasons 2011-2020¶
Mean Goals Scored vs Mean Points: The correlation between mean goals scored and mean points reaffirms the previously observed trend that teams with higher goal-scoring abilities tend to accumulate more points. The positioning of Manchester City at the top right further emphasizes their consistent success and strong offensive performance over the 10-year span.
Mean Goals Conceded vs Mean Points: The negative correlation between mean goals conceded and mean points aligns with the understanding that teams with better defensive records tend to achieve higher point totals. Manchester City's presence at the top left end underscores their overall defensive stability and thus strong point accumulation.
Mean Goal Difference vs Mean Points: The direct correlation between mean goal difference and mean points confirms the trend that teams with better goal differences, reflecting a balanced offensive and defensive performance, tend to be more successful. This relationship supports the notion that a well-rounded team has a higher chance of achieving success.
Mean Corner Kicks vs Mean Points: The correlation between mean corner kicks and mean points suggests that teams generating more corner kicks tend to secure more points. Manchester City's strong performance in both categories again highlights their proactive approach to attacking set pieces, contributing to their success in point accumulation.
Mean Fouls vs Mean Points: The correlation line here is much more tempered and may not be as significant as for the previous factors. Still, the slightly negative trend of teams with fewer mean fouls achieving better mean points could suggest that disciplined play is linked to higher success.
However, Manchester City's outlier position could indicate that while their style of play may involve more fouls, their overall success remains unaffected due to other factors.
Mean Yellow Cards vs Mean Points: The slight negative slope observed between mean yellow cards and mean points appears to suggest that teams with fewer yellow cards tend to achieve higher points. However, this is a weak correlation, and the presence of outliers, including successful teams with more yellow cards, indicates that other factors might mitigate the impact of disciplinary actions.
Mean Red Cards vs Mean Points: Similar to yellow cards, there is a very slight trend of fewer mean red cards correlating with higher mean points. While a less aggressive approach might be beneficial, there are plenty of cases where successful teams have received more red cards.
Mean Total Shots vs Mean Points: The direct correlation between mean total shots and mean points reinforces the notion that teams frequently generating scoring opportunities tend to accumulate more points. This emphasizes the importance of attacking intent in achieving favorable results.
Mean Total Shots on Target vs Mean Points: The trend of higher mean total shots on target being associated with better mean points aligns with the idea that shot accuracy is crucial for converting opportunities into goals. Outliers like Bolton indicate that other factors can impact point accumulation.
Mean Percentage of Shots Scored vs Mean Points: The general correlation between mean percentage of shots scored and mean points underscores the tactical significance of shot efficiency. While there's a trend that better shot conversion leads to higher point totals, exceptions like Reading highlight the complexity of factors affecting team success.
These trends contribute to a better understanding of the strategic priorities that lead to strong point outcomes in the Premier League. Keep in mind that while these correlations are present, other contextual factors can also play a role in team performance. Nonetheless, these subplots hint at the specific factors that either increase or decrease a team's 'Pts' at the end of a season.
FOR MORE INFORMATION ABOUT PLAYING STYLES AND TECHNIQUES:¶
https://www.statsperform.com/resource/stats-playing-styles-introduction/
Model: Analysis, Hypothesis Testing, & ML¶
Although the plots above with regression lines show good linear relationships between the individual features and points, we wanted to determine whether the data is suitable for multivariate linear regression by looking at the correlations between our variables. The matrix below showed higher-than-expected correlations between some variables, but the results were understandable. For example, Fouls, YellowCard, and RedCard are correlated with each other but not with variables such as GF, GA, or TotalShots. Looking closely, Fouls and YellowCard have a strong correlation, which is predictable: the more fouls a team commits, the more likely it is to receive yellow cards. We can also observe that GF is positively correlated with CornerKicks, reflecting the increased chance of set-piece goals. Given the correlation matrix, many variables are correlated with one another; nevertheless, we test a multivariate linear regression model below, because variables such as GF, GD, and TotalShots have practical significance in determining a club's performance in the league.
Here is the link to help you interpret the heatmap of correlation matrix: https://www.statology.org/how-to-read-a-correlation-matrix/
# Compute the correlation matrix over the numeric columns
corr_matrix = complete_data.corr(numeric_only=True)
# Set the figure size
plt.figure(figsize=(15, 10))
# Draw the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f")
# Rotate y-axis labels for better readability
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
Explanation Of the Code Above¶
Looking at our correlation graph, we can see that:
==> Factors like Goals Scored (+0.91), GD (+0.97), Corner Kicks (+0.71), TotalShots (+0.78), TotalShotsOnTarget (0.54), and PercentShotsScored (+0.73) had positive correlations with Pts.
==> Factors like Goals Conceded (-0.84), Fouls (-0.26), and YellowCard (-0.22) had negative correlations with Pts.
This makes sense, as more scoring attempts lead to more goals and more wins, and teams are awarded 3 points per win (and 1 point per draw). In addition, higher levels of fouls and penalties can hand the opposing team chances to score, increasing a team's chance of losing the game (and thus earning fewer points).
That being said, Fouls and YellowCard had weaker correlations with Pts than preferred (closer to zero than -0.5).
One more interesting thing to note is that Red Cards had a very slight negative correlation that was close to negligible. Further analysis will be needed to determine why this is.
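Individual entries of the matrix above can be reproduced for any pair of columns with pandas' `Series.corr`, which computes the Pearson coefficient. A self-contained example on toy data (not the tutorial's `complete_data`), using perfectly linear columns so the signs are obvious:

```python
import pandas as pd

# Toy data: points rise with goals scored and fall with fouls
toy = pd.DataFrame({
    "Pts":   [30, 45, 60, 75, 90],
    "GF":    [35, 50, 65, 80, 95],   # perfectly linear with Pts
    "Fouls": [90, 80, 70, 60, 50],   # perfectly anti-linear with Pts
})

print(toy["Pts"].corr(toy["GF"]))     # 1.0
print(toy["Pts"].corr(toy["Fouls"]))  # -1.0
```

Real data never reaches these extremes; the heatmap's +0.91 for GF and -0.26 for Fouls fall between these bounds, which is what the color scale encodes.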
OLS Regression With Single Independent Variables¶
Overall, we can see from the output that certain factors are statistically significant in affecting a team's seasonal points. Factors like Goals Scored (GF), Goals Conceded (GA), CornerKicks, Fouls, YellowCard, TotalShots, TotalShotsOnTarget, and PercentShotsScored all had p-values below the default significance level of 0.05. This implies that they are statistically significant in their effect as independent variables on the dependent variable, Team Points (Pts).
In contrast, the RedCard factor had a p-value of 0.124, which is above the default significance level of 0.05. This implies that RedCard is NOT statistically significant in its effect as an independent variable on Team Points (Pts). This is a bit curious and could reflect a tradeoff between behaviors that benefited the team (but resulted in a red card) and the player being expelled from the game because of the red card.
The largest R-squared value among the individual factors was 0.830, for Goals Scored (GF). This implies that, of the factors we tested, Goals Scored explains the largest share of the variation in a team's total points in a given season.
Here is a link to help readers understand how to interpret OLS linear regression analysis (for both multivariate and single variables): https://www.geeksforgeeks.org/interpreting-the-results-of-linear-regression-using-ols-summary/.
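The nine single-variable fits below could also be generated in one loop that collects each factor's slope, p-value, and R-squared into a single table. A minimal sketch on synthetic data (the `demo` DataFrame and its two factors are stand-ins, not the tutorial's `complete_data`):

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic stand-in data: points built mostly from goals, plus noise
rng = np.random.default_rng(0)
n = 50
demo = pd.DataFrame({
    "GF": rng.integers(30, 100, n),
    "GA": rng.integers(30, 90, n),
})
demo["Pts"] = 1.0 * demo["GF"] - 1.0 * demo["GA"] + rng.normal(0, 3, n)

# Fit one single-variable OLS model per factor and tabulate the results
rows = []
for factor in ["GF", "GA"]:
    fit = smf.ols(formula=f"Pts ~ {factor}", data=demo).fit()
    rows.append({
        "factor": factor,
        "slope": fit.params[factor],
        "p_value": fit.pvalues[factor],
        "r_squared": fit.rsquared,
    })
summary = pd.DataFrame(rows)
print(summary)
```

This yields the same numbers as the repeated `.summary()` cells below, but in a form that is easy to sort by R-squared or p-value.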
# Evaluate linear regression model using statsmodels OLS
stats = statsmodels.formula.api.ols(formula="Pts ~ GF", data=complete_data).fit()
# Print the summary
stats.summary()
Dep. Variable: | Pts | R-squared: | 0.830 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.829 |
Method: | Least Squares | F-statistic: | 965.8 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 4.38e-78 |
Time: | 03:32:08 | Log-Likelihood: | -677.90 |
No. Observations: | 200 | AIC: | 1360. |
Df Residuals: | 198 | BIC: | 1366. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 2.4506 | 1.684 | 1.455 | 0.147 | -0.870 | 5.771 |
GF | 0.9560 | 0.031 | 31.077 | 0.000 | 0.895 | 1.017 |
Omnibus: | 0.203 | Durbin-Watson: | 2.018 |
---|---|---|---|
Prob(Omnibus): | 0.903 | Jarque-Bera (JB): | 0.355 |
Skew: | -0.026 | Prob(JB): | 0.837 |
Kurtosis: | 2.800 | Cond. No. | 181. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Evaluate linear regression model using statsmodels OLS
stats1 = statsmodels.formula.api.ols(formula="Pts ~ GA", data=complete_data).fit()
# Print the summary
stats1.summary()
Dep. Variable: | Pts | R-squared: | 0.707 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.705 |
Method: | Least Squares | F-statistic: | 476.8 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 1.28e-54 |
Time: | 03:32:12 | Log-Likelihood: | -732.40 |
No. Observations: | 200 | AIC: | 1469. |
Df Residuals: | 198 | BIC: | 1475. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 111.9906 | 2.813 | 39.806 | 0.000 | 106.442 | 117.539 |
GA | -1.1439 | 0.052 | -21.837 | 0.000 | -1.247 | -1.041 |
Omnibus: | 2.202 | Durbin-Watson: | 1.744 |
---|---|---|---|
Prob(Omnibus): | 0.333 | Jarque-Bera (JB): | 2.197 |
Skew: | 0.251 | Prob(JB): | 0.333 |
Kurtosis: | 2.896 | Cond. No. | 226. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Evaluate linear regression model using statsmodels OLS
stats2 = statsmodels.formula.api.ols(formula="Pts ~ CornerKicks", data=complete_data).fit()
# Print the summary
stats2.summary()
Dep. Variable: | Pts | R-squared: | 0.510 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.508 |
Method: | Least Squares | F-statistic: | 206.2 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 1.63e-32 |
Time: | 03:32:14 | Log-Likelihood: | -783.66 |
No. Observations: | 200 | AIC: | 1571. |
Df Residuals: | 198 | BIC: | 1578. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -18.5494 | 5.010 | -3.702 | 0.000 | -28.430 | -8.669 |
CornerKicks | 0.3468 | 0.024 | 14.360 | 0.000 | 0.299 | 0.394 |
Omnibus: | 2.552 | Durbin-Watson: | 1.954 |
---|---|---|---|
Prob(Omnibus): | 0.279 | Jarque-Bera (JB): | 2.151 |
Skew: | 0.222 | Prob(JB): | 0.341 |
Kurtosis: | 3.246 | Cond. No. | 1.20e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.2e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Evaluate linear regression model using statsmodels OLS
stats3 = statsmodels.formula.api.ols(formula="Pts ~ Fouls", data=complete_data).fit()
# Print the summary
stats3.summary()
Dep. Variable: | Pts | R-squared: | 0.070 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.066 |
Method: | Least Squares | F-statistic: | 14.99 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 0.000147 |
Time: | 03:32:16 | Log-Likelihood: | -847.72 |
No. Observations: | 200 | AIC: | 1699. |
Df Residuals: | 198 | BIC: | 1706. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 99.7441 | 12.307 | 8.105 | 0.000 | 75.475 | 124.013 |
Fouls | -0.1162 | 0.030 | -3.872 | 0.000 | -0.175 | -0.057 |
Omnibus: | 7.850 | Durbin-Watson: | 1.621 |
---|---|---|---|
Prob(Omnibus): | 0.020 | Jarque-Bera (JB): | 7.224 |
Skew: | 0.403 | Prob(JB): | 0.0270 |
Kurtosis: | 2.533 | Cond. No. | 4.23e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.23e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Evaluate linear regression model using statsmodels OLS
stats4 = statsmodels.formula.api.ols(formula="Pts ~ YellowCard", data=complete_data).fit()
# Print the summary
stats4.summary()
Dep. Variable: | Pts | R-squared: | 0.046 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.041 |
Method: | Least Squares | F-statistic: | 9.496 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 0.00235 |
Time: | 03:32:19 | Log-Likelihood: | -850.34 |
No. Observations: | 200 | AIC: | 1705. |
Df Residuals: | 198 | BIC: | 1711. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 75.1675 | 7.512 | 10.006 | 0.000 | 60.353 | 89.982 |
YellowCard | -0.3689 | 0.120 | -3.081 | 0.002 | -0.605 | -0.133 |
Omnibus: | 11.220 | Durbin-Watson: | 1.616 |
---|---|---|---|
Prob(Omnibus): | 0.004 | Jarque-Bera (JB): | 12.011 |
Skew: | 0.581 | Prob(JB): | 0.00247 |
Kurtosis: | 2.702 | Cond. No. | 390. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Evaluate linear regression model using statsmodels OLS
stats5 = statsmodels.formula.api.ols(formula="Pts ~ RedCard", data=complete_data).fit()
# Print the summary
stats5.summary()
Dep. Variable: | Pts | R-squared: | 0.012 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.007 |
Method: | Least Squares | F-statistic: | 2.388 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 0.124 |
Time: | 03:32:22 | Log-Likelihood: | -853.82 |
No. Observations: | 200 | AIC: | 1712. |
Df Residuals: | 198 | BIC: | 1718. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 55.2518 | 2.260 | 24.445 | 0.000 | 50.795 | 59.709 |
RedCard | -1.0981 | 0.711 | -1.545 | 0.124 | -2.499 | 0.303 |
Omnibus: | 11.792 | Durbin-Watson: | 1.638 |
---|---|---|---|
Prob(Omnibus): | 0.003 | Jarque-Bera (JB): | 12.784 |
Skew: | 0.605 | Prob(JB): | 0.00168 |
Kurtosis: | 2.734 | Cond. No. | 6.27 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Evaluate linear regression model using statsmodels OLS
stats6 = statsmodels.formula.api.ols(formula="Pts ~ TotalShots", data=complete_data).fit()
# Print the summary
stats6.summary()
Dep. Variable: | Pts | R-squared: | 0.611 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.609 |
Method: | Least Squares | F-statistic: | 311.6 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 1.64e-42 |
Time: | 03:32:24 | Log-Likelihood: | -760.49 |
No. Observations: | 200 | AIC: | 1525. |
Df Residuals: | 198 | BIC: | 1532. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -22.5517 | 4.311 | -5.231 | 0.000 | -31.053 | -14.050 |
TotalShots | 0.1549 | 0.009 | 17.652 | 0.000 | 0.138 | 0.172 |
Omnibus: | 6.095 | Durbin-Watson: | 1.957 |
---|---|---|---|
Prob(Omnibus): | 0.047 | Jarque-Bera (JB): | 5.805 |
Skew: | 0.356 | Prob(JB): | 0.0549 |
Kurtosis: | 3.434 | Cond. No. | 2.75e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.75e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Evaluate linear regression model using statsmodels OLS
stats7 = statsmodels.formula.api.ols(formula="Pts ~ TotalShotsOnTarget", data=complete_data).fit()
# Print the summary
stats7.summary()
Dep. Variable: | Pts | R-squared: | 0.288 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.284 |
Method: | Least Squares | F-statistic: | 80.10 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 2.59e-16 |
Time: | 03:32:26 | Log-Likelihood: | -821.05 |
No. Observations: | 200 | AIC: | 1646. |
Df Residuals: | 198 | BIC: | 1653. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 25.0790 | 3.218 | 7.794 | 0.000 | 18.734 | 31.424 |
TotalShotsOnTarget | 0.1396 | 0.016 | 8.950 | 0.000 | 0.109 | 0.170 |
Omnibus: | 7.301 | Durbin-Watson: | 1.351 |
---|---|---|---|
Prob(Omnibus): | 0.026 | Jarque-Bera (JB): | 7.252 |
Skew: | 0.464 | Prob(JB): | 0.0266 |
Kurtosis: | 3.095 | Cond. No. | 636. |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Evaluate linear regression model using statsmodels OLS
stats8 = statsmodels.formula.api.ols(formula="Pts ~ PercentShotsScored", data=complete_data).fit()
# Print the summary
stats8.summary()
Dep. Variable: | Pts | R-squared: | 0.538 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.536 |
Method: | Least Squares | F-statistic: | 230.5 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 4.93e-35 |
Time: | 03:32:29 | Log-Likelihood: | -777.82 |
No. Observations: | 200 | AIC: | 1560. |
Df Residuals: | 198 | BIC: | 1566. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -15.3248 | 4.534 | -3.380 | 0.001 | -24.266 | -6.383 |
PercentShotsScored | 635.0815 | 41.831 | 15.182 | 0.000 | 552.590 | 717.573 |
Omnibus: | 0.071 | Durbin-Watson: | 1.711 |
---|---|---|---|
Prob(Omnibus): | 0.965 | Jarque-Bera (JB): | 0.202 |
Skew: | 0.005 | Prob(JB): | 0.904 |
Kurtosis: | 2.845 | Cond. No. | 50.3 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
OLS Regression With Multiple Independent Variables and Prediction Comparison¶
We also wanted to observe how the variables above would jointly contribute to accumulating points for teams in the Premier League. Looking at the linear regression output below, variables such as GA, TotalShots, and PercentShotsScored are statistically significant in this model, and they behave as expected: goals conceded lower a team's chance of winning a game, while taking more shots and converting them more efficiently increase the chance of scoring. Their coefficients support these observations. The negative coefficient on GA indicates that each goal conceded lowers points, while the positive coefficients on the other two indicate a positive relationship with points. However, some variables, such as GF and TotalShotsOnTarget, had higher p-values than expected, which could mean either that the variable is simply not significant or that there is correlation between variables that we needed to check. Despite the unexpected p-values, the R-squared of 0.941 indicates that 94.1% of the variation in points can be explained by the independent variables in this model. This is a very large value and implies a strong regression model.
In addition, 0.941 is greater than the largest individual-factor R-squared value, which was 0.830 for Goals Scored. This further suggests that a more comprehensive model including multiple factors leads to a better fit.
# Evaluate linear regression model using statsmodels OLS
stats9 = statsmodels.formula.api.ols(formula="Pts ~ GF + GA + Fouls + YellowCard + RedCard + TotalShots + TotalShotsOnTarget + PercentShotsScored", data=complete_data).fit()
# Print the summary
stats9.summary()
Dep. Variable: | Pts | R-squared: | 0.941 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.938 |
Method: | Least Squares | F-statistic: | 380.0 |
Date: | Mon, 21 Aug 2023 | Prob (F-statistic): | 6.35e-113 |
Time: | 03:32:33 | Log-Likelihood: | -572.18 |
No. Observations: | 200 | AIC: | 1162. |
Df Residuals: | 191 | BIC: | 1192. |
Df Model: | 8 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 30.8839 | 10.755 | 2.872 | 0.005 | 9.670 | 52.098 |
GF | 0.2234 | 0.177 | 1.260 | 0.209 | -0.126 | 0.573 |
GA | -0.6010 | 0.033 | -18.279 | 0.000 | -0.666 | -0.536 |
Fouls | -0.0172 | 0.010 | -1.744 | 0.083 | -0.037 | 0.002 |
YellowCard | 0.0271 | 0.038 | 0.721 | 0.472 | -0.047 | 0.101 |
RedCard | 0.0585 | 0.183 | 0.320 | 0.750 | -0.302 | 0.419 |
TotalShots | 0.0479 | 0.021 | 2.304 | 0.022 | 0.007 | 0.089 |
TotalShotsOnTarget | -0.0075 | 0.006 | -1.193 | 0.234 | -0.020 | 0.005 |
PercentShotsScored | 231.3256 | 89.116 | 2.596 | 0.010 | 55.548 | 407.103 |
Omnibus: | 3.034 | Durbin-Watson: | 2.248 |
---|---|---|---|
Prob(Omnibus): | 0.219 | Jarque-Bera (JB): | 2.778 |
Skew: | 0.178 | Prob(JB): | 0.249 |
Kurtosis: | 3.454 | Cond. No. | 1.97e+05 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.97e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
The table below compares the actual points each team earned with the points predicted by the multivariate OLS model. Although some variables are more correlated with one another than expected, we kept them because of their practical significance (GF, for example). As it turns out, the multivariate model's predictions were very close to the actual data, which was interesting to observe.
# Copy complete data from above
compare_data = complete_data.copy()
# Drop unnecessary column
compare_data = compare_data.drop(['Pts'], axis=1)
# Get the prediction data based on the multivariate linear regression model
prediction = stats9.predict(compare_data)
# Copy complete data for comparison of actual data vs prediction data
result = complete_data.copy()
# Drop unnecessary columns and add prediction data into comparison table
result = result.drop(['Rk', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'CornerKicks', 'Fouls', 'YellowCard', 'RedCard', 'TotalShots', 'TotalShotsOnTarget', 'PercentShotsScored'], axis=1)
result['Predicted_Pts'] = prediction
result = result.rename(columns={'Pts':'Actual_Pts'})
# Print the comparison table
print(result.head())
              Team  Season_End_Year  Actual_Pts  Predicted_Pts
0          Arsenal             2011          68      69.802765
1      Aston Villa             2011          48      45.210850
2  Birmingham City             2011          39      39.822635
3        Blackburn             2011          43      44.515867
4        Blackpool             2011          39      38.873528
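To put a number on how close these predictions are, we could compute standard error metrics. A quick sketch using just the five rows shown above (a full evaluation would use all 200 rows of `result`):

```python
import numpy as np

# Actual vs predicted points for the first five rows of the comparison table
actual = np.array([68, 48, 39, 43, 39])
predicted = np.array([69.802765, 45.210850, 39.822635, 44.515867, 38.873528])

residuals = actual - predicted
mae = np.mean(np.abs(residuals))          # mean absolute error
rmse = np.sqrt(np.mean(residuals ** 2))   # root mean squared error
print(f"MAE: {mae:.2f} points, RMSE: {rmse:.2f} points")
```

For these rows the model is off by only a point or two per team, consistent with the strong fit the R-squared suggested.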
In addition, we plotted the actual points against the predicted points to visualize their linear relationship. The plot below shows a strong linear relationship between the predicted and actual points, forming a well-defined, positively sloped ellipse of points.
# Set up the figure and axis
plt.figure(figsize=(10, 6))
# Scatter plot
sns.scatterplot(x='Actual_Pts', y='Predicted_Pts', data=result, color='blue')
# Add a line of perfect prediction
max_pts = max(result['Actual_Pts'].max(), result['Predicted_Pts'].max())
min_pts = min(result['Actual_Pts'].min(), result['Predicted_Pts'].min())
plt.plot([min_pts, max_pts], [min_pts, max_pts], color='red', linestyle='--')
# Add title and labels
plt.title('Actual Points vs Predicted Points')
plt.xlabel('Actual Points')
plt.ylabel('Predicted Points')
plt.grid(True)
# Display the plot
plt.tight_layout()
plt.show()
Explanation of Code Above¶
Again, looking at the plot above, we can see that the multivariate model appears to be a very good fit. If the model were perfect, the predicted vs. actual points would fall exactly on a straight line. The plotted points lie very close to this ideal line, which suggests a good fit and thus a good model.
Interpretation: Insight & Policy Decision¶
Overall, we can see that several factors play a significant role in either increasing or decreasing a team's points at the end of a season. Based on our OLS regression results, run with the formula "Pts ~ factor" for each given factor, we drew the following conclusions:
==> Goals Scored (GF), Goals Conceded (GA), CornerKicks, Fouls, YellowCard, TotalShots, TotalShotsOnTarget, and PercentShotsScored all have p-values below 0.05, implying that they have a statistically significant impact on team points (Pts). Goals Scored (GF) especially appeared to impact team points because it had the highest R-squared value, 0.830.
==> The RedCard factor had a p-value of 0.124, which is greater than 0.05, implying that it is NOT statistically significant in its effect on team points (Pts).
Looking at our correlation graph, we can see that:
==> Factors such as Goals Scored (+0.91), GD (+0.97), Corner Kicks (+0.71), TotalShots (+0.78), TotalShotsOnTarget (+0.54), and PercentShotsScored (+0.73) had positive correlations with Pts.
==> Factors such as Goals Conceded (-0.84), Fouls (-0.26), and YellowCard (-0.22) had negative correlations with Pts.
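Correlations like the ones above come straight from the data's correlation matrix. A minimal sketch on synthetic stand-in data (the column names mirror ours, the numbers are made up; the real analysis would call this on `complete_data`):

```python
import numpy as np
import pandas as pd

# Synthetic stand-in: Pts rises with GF, while Fouls is unrelated here
rng = np.random.default_rng(7)
gf = rng.normal(55, 15, 200)
df = pd.DataFrame({
    'GF': gf,
    'Pts': 1.1 * gf + rng.normal(0, 8, 200),
    'Fouls': rng.normal(400, 30, 200),
})

# Pairwise Pearson correlations of every factor with Pts
corr_with_pts = df.corr()['Pts'].drop('Pts')
print(corr_with_pts.sort_values(ascending=False))
```

Sorting the resulting series, as we did for our correlation graph, immediately ranks the factors by the strength of their linear association with points.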
Looking forward, we can use this information to isolate strategies that clubs can focus on to increase their chance of winning (based on the points they can earn). Focusing on strong striking ability increases a club's capacity to take many shots on target, which results in more goals scored and thus more points. Likewise, introducing measures to discourage unnecessary bad behavior can reduce the chance of conceding a foul or receiving a yellow card, which appears to result in more points.
Furthermore, future analyses could use additional statistical techniques to deepen our understanding of how to maximize points in the EPL. One such improvement would be to explore specific strategies and the correlations between perceived negative and positive variables. For example, on the surface, aggressive play could be seen as a positive because it could increase the chance of scoring goals, which would increase points. However, aggressive play could also increase the chance of fouls, which would decrease points. If we found an aggressive strategy that increased shots on target while barely increasing fouls, that would be ideal.
This could be achieved with techniques such as finding additional variables that are not correlated with one another (to lower the p-values of the multivariate linear regression model) or by analyzing more intricate statistics for individual clubs.
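One simple version of that idea is to drop a regressor that is nearly redundant with another and confirm the fit barely changes. A sketch on synthetic stand-in data (column names mirror ours, numbers are made up), using the same statsmodels formula API as the cells above:

```python
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic stand-in data with two deliberately collinear shot measures
rng = np.random.default_rng(1)
n = 200
df = pd.DataFrame({'TotalShots': rng.normal(500, 50, n)})
df['TotalShotsOnTarget'] = 0.35 * df['TotalShots'] + rng.normal(0, 3, n)
df['Pts'] = 0.15 * df['TotalShots'] + rng.normal(0, 5, n)

# Full model vs a reduced model without the redundant regressor
full = smf.ols("Pts ~ TotalShots + TotalShotsOnTarget", data=df).fit()
reduced = smf.ols("Pts ~ TotalShots", data=df).fit()

# If R-squared barely moves, the dropped variable added little beyond its twin
print(full.rsquared, reduced.rsquared)
```

When the reduced model explains essentially the same variance, the simpler model is preferable: its coefficients are easier to interpret and their p-values are no longer inflated by collinearity.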
In conclusion, we have determined which factors statistically help or hurt a team's ability to earn points in the EPL and have suggested measures to increase or lessen these factors.
While remaining in the Premier League can be lucrative, it is not easy. A club must stay efficient enough to remain above the bottom three spots in the EPL table. To achieve this, proper data analytics, carried out through the stages of data collection, processing, hypothesis testing, and interpretation, are crucial.
https://drive.google.com/file/d/15qGW2Zypikj4ehth09xCViibzTWZXTHV/view?usp=drive_link