Let's start off by providing some background information about this topic, defining the objectives of this project, and configuring some things.
The National Basketball Association (NBA) is a professional basketball league in the United States. There are $30$ teams in the league, divided evenly into $2$ conferences: the Eastern Conference and the Western Conference.
In the regular season, each team plays $82$ games. NBA regular season standings are determined by teams' win-loss records within their conferences.
The top $8$ teams from each conference advance to the playoffs. In the event of a tie in the standings, there is a tie-breaking procedure used to determine playoff seeding.
Starting in the $2019\text{-}20$ season, the NBA added a play-in tournament to give the $9^\text{th}$ and $10^\text{th}$ place teams in each conference the opportunity to earn a spot in the playoffs. In the current format, the $7^\text{th}$ and $8^\text{th}$ place teams play one game, with the winner earning the $7^\text{th}$ seed; the $9^\text{th}$ and $10^\text{th}$ place teams play one game, with the loser being eliminated; and the loser of the first game then plays the winner of the second game, with the winner earning the $8^\text{th}$ seed.
Once the final playoff seeding is determined, each team plays an opponent in a best-of-$7$ series. The first to win $4$ games advances to the next round. The first round is followed by the conference semifinals, then the conference finals, then the finals. The team that wins the NBA Finals is the NBA Champion.
The matchups for each round are determined using a traditional bracket structure, shown below:
We want to perform some analysis to see if we can identify factors underlying teams' level of success in the playoffs. Our ultimate goal will be to predict the outcome of the NBA Playoffs using data from the regular season.
Can we accurately predict how many playoff games a team will win?
With this information, we could estimate how deep a playoff run a team is likely to make (e.g. whether it is likely to win a playoff series, reach the conference finals, or win the championship).
These are some of the questions we want to answer as we go through the full data science pipeline.
We'll start by importing the Python libraries necessary for this project and configuring some things.
# system
import warnings
import time
from pathlib import Path
import itertools
import textwrap
# data
import requests
from bs4 import BeautifulSoup, Comment, MarkupResemblesLocatorWarning
import pandas as pd
import numpy as np
# visualization
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
OneHotEncoder,
StandardScaler,
MinMaxScaler,
MaxAbsScaler,
RobustScaler,
)
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import (
RandomForestRegressor,
GradientBoostingRegressor,
AdaBoostRegressor,
)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import skops.io as sio
# warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning, module="bs4")
# requests
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
# pandas
pd.set_option("display.max_columns", None)
# plotly
pio.renderers.default = "notebook+plotly_mimetype+png+jpeg+svg+pdf"
Now, we need to collect data that we can use in our analysis.
We will scrape data from Basketball Reference, a site that provides historical basketball data.
We will use data from the $2002\text{-}03$ season (when the NBA switched to the current playoff format, where every series is best-of-$7$) to the $2022\text{-}23$ season (the current season). For each season, we will scrape the following information: per game stats and advanced stats from the regular season, the expanded standings, and advanced stats from the playoffs.
For convenience, we will define a function called pages_to_scrape. Given a season, it will return a dictionary which maps the URL of each page we will be scraping to a list of information about each table we will be scraping from that page.
Each element in the list will be in the form of a dictionary with $2$ items:
- the id of the HTML table element we will be scraping from the page
- the path where we will be storing the table as a CSV
def pages_to_scrape(season):
    return {
        f"https://www.basketball-reference.com/leagues/NBA_{season}.html": [
            {
                "id": "per_game-team",
                "path": f"data/raw/{season}/regular_season/per_game_stats.csv",
            },
            {
                "id": "advanced-team",
                "path": f"data/raw/{season}/regular_season/advanced_stats.csv",
            },
        ],
        f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html": [
            {
                "id": "expanded_standings",
                "path": f"data/raw/{season}/regular_season/standings.csv",
            }
        ],
        f"https://www.basketball-reference.com/playoffs/NBA_{season}.html": [
            {
                "id": "advanced-team",
                "path": f"data/raw/{season}/playoffs/advanced_stats.csv",
            }
        ],
    }
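As a quick usage sketch (this loop is illustrative and not part of the scraping itself), calling pages_to_scrape for a single season shows the three pages and the table ids we expect from each:
pages = pages_to_scrape(2003)
for url, infoList in pages.items():
    # each URL maps to the ids of the tables we will pull from that page
    print(url, [info["id"] for info in infoList])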
To scrape the data from each page, we will do the following:
1. Make an HTTP GET request to the appropriate URL using the Requests library.
2. Find the HTML table with the appropriate id.
3. Convert the HTML table into a DataFrame using pandas.
4. Make sure the path exists in the filesystem using pathlib.
5. Save the DataFrame to a CSV file at the appropriate path using pandas.
In our approach, there are a few issues that we have to address as well:
- If the CSV files for a page already exist, we will not re-scrape the page.
- Some table elements are hidden inside of HTML comments, so we have to look there if a table can't be found normally.
- To avoid sending requests too quickly, we sleep between each HTTP GET request using the time library.
# get list of seasons
seasons = list(range(2003, 2023 + 1))
# go through seasons
for season in seasons:
    # get pages to scrape for season
    pages = pages_to_scrape(season)
    # go through pages
    for url, infoList in pages.items():
        # skip if all CSV files exist
        if all([Path(info["path"]).exists() for info in infoList]):
            continue
        # request data from page
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, "html.parser")
        # go through tables
        for info in infoList:
            # skip if CSV file exists
            if Path(info["path"]).exists():
                continue
            # find table
            table = soup.find("table", id=info["id"])
            if table is None:
                for comment in soup.find_all(
                    string=lambda text: isinstance(text, Comment)
                ):
                    comment_soup = BeautifulSoup(comment, "html.parser")
                    table = comment_soup.find("table", id=info["id"])
                    if table is not None:
                        break
            # convert table to DataFrame and save as CSV
            df = pd.read_html(str(table))[0]
            Path(info["path"]).parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(info["path"], index=False)
        # sleep for 10 seconds before next request
        time.sleep(10)
Now that we have collected all the data, we need to process it and make it suitable for analysis. We will be making extensive use of pandas for manipulating DataFrames.
The first step is to figure out how we will be cleaning all the data.
First, let's define a function called rename_column. It will take in a column name and return a column name that has been modified to provide more consistency.
The renaming rules are as follows:
- If the name contains Unnamed, we will replace it with a blank string.
- If the name is Rk, we will replace it with Rank.
- If the name is Tm, we will replace it with Team.
- If the name is Offense Four Factors (which is the name for a group of $4$ different columns), we will replace it with a blank string. This way, the sub-columns will be assumed to be referring to the team's statistics on offense.
- If the name is Defense Four Factors (which is the name for a group of $4$ different columns), we will replace it with Opp. This way, the sub-columns will be assumed to be referring to the opponent's statistics on offense (the team's statistics on defense).
def rename_column(name):
    if "Unnamed" in name:
        return ""
    elif name == "Rk":
        return "Rank"
    elif name == "Tm":
        return "Team"
    elif name == "Offense Four Factors":
        return ""
    elif name == "Defense Four Factors":
        return "Opp"
    else:
        return name
Now, let's define a function called rename_columns. It takes in a DataFrame and a list of the header rows.
It works as follows:
- If there is $1$ header row, it renames each column using rename_column.
- If there are $2$ header rows, it renames both parts of each column using rename_column and combines them.
def rename_columns(df, header):
    if len(header) == 1:
        df.columns = [rename_column(i) for i in df.columns]
    else:
        df.columns = [f"{rename_column(i)}{rename_column(j)}" for i, j in df.columns]
    return df
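As a small sketch of how this behaves on a $2$-row header (the toy DataFrame below is hypothetical), the Offense Four Factors and Defense Four Factors groups collapse into plain and Opp-prefixed names:
toy = pd.DataFrame(
    [[0.5, 0.5]],
    columns=pd.MultiIndex.from_tuples(
        [("Offense Four Factors", "eFG%"), ("Defense Four Factors", "eFG%")]
    ),
)
rename_columns(toy, header=[0, 1])
print(list(toy.columns))  # ['eFG%', 'OppeFG%']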
Now, let's create a dictionary called files_to_clean with information about the CSV files we need to clean. It will map each file name to a dictionary with $3$ items:
- the header of the CSV file (an array of the row indices for the header)
- the columns of the CSV file that we want to keep (after they have been renamed using the rename_columns function above)
- the column_mappings, which map old column names to new column names (for renaming purposes)
files_to_clean = {
"regular_season/per_game_stats.csv": {
"header": [0],
"columns": [
"Team",
"FG",
"FGA",
"FG%",
"3P",
"3PA",
"3P%",
"2P",
"2PA",
"2P%",
"FT",
"FTA",
"FT%",
"ORB",
"DRB",
"TRB",
"AST",
"STL",
"BLK",
"TOV",
"PF",
"PTS",
],
"column_mappings": {},
},
"regular_season/advanced_stats.csv": {
"header": [0, 1],
"columns": [
"Team",
"SRS",
"ORtg",
"DRtg",
"NRtg",
"Pace",
"FTr",
"3PAr",
"TS%",
"eFG%",
"TOV%",
"ORB%",
"FT/FGA",
"OppeFG%",
"OppTOV%",
"OppDRB%",
"OppFT/FGA",
],
"column_mappings": {"OppDRB%": "DRB%"},
},
"regular_season/standings.csv": {
"header": [0, 1],
"columns": [
"Rank",
"Team",
"Overall",
"PlaceHome",
"PlaceRoad",
"ConferenceE",
"ConferenceW",
],
"column_mappings": {
"Overall": "OverallRecord",
"PlaceHome": "HomeRecord",
"PlaceRoad": "RoadRecord",
"ConferenceE": "EastRecord",
"ConferenceW": "WestRecord",
},
},
"playoffs/advanced_stats.csv": {
"header": [0, 1],
"columns": ["Rank", "Team", "W", "L"],
"column_mappings": {"Rank": "PlayoffRank", "W": "PlayoffW", "L": "PlayoffL"},
},
}
Now, let's define a function called clean_csv. Given the path to a CSV file, it will return a DataFrame with a cleaned version of the data from the CSV file.
To clean a file, we do the following:
1. Using the files_to_clean dictionary, obtain the information needed to clean the CSV file at the specified path.
2. Read the CSV file at the specified path into a DataFrame.
3. Rename the columns using the rename_columns function.
4. Keep only the desired columns of the DataFrame.
5. Rename the remaining columns using the column_mappings.
6. Remove the row where Team is League Average (since this is an aggregate of all the rows in the DataFrame, and we have no use for it).
7. Remove the * character from any values in the Team column (since it is used to indicate if a team made the playoffs, but we already have that data).
8. Replace Seattle Supersonics with Seattle SuperSonics and Charlotte Bobcats with Charlotte Hornets in the Team column (to account for inconsistent naming of teams in the CSV files).
def clean_csv(path):
    # obtain information needed to clean CSV file
    name = f"{Path(path).parents[0].name}/{Path(path).name}"
    info = files_to_clean[name]
    # read CSV file
    df = pd.read_csv(path, header=info["header"])
    # rename columns
    rename_columns(df, info["header"])
    # remove unnecessary columns
    df = df[info["columns"]]
    # rename remaining columns
    df = df.rename(columns=info["column_mappings"])
    # remove "League Average" row
    df = df[df["Team"] != "League Average"]
    # remove "*" from team names
    df["Team"] = df["Team"].str.replace("*", "", regex=False)
    # make team names consistent
    df["Team"] = df["Team"].str.replace(
        "Seattle Supersonics", "Seattle SuperSonics", regex=False
    )
    df["Team"] = df["Team"].str.replace(
        "Charlotte Bobcats", "Charlotte Hornets", regex=False
    )
    # return DataFrame
    return df
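As a usage sketch (assuming the $2003$ files were scraped into data/raw above), cleaning a single file looks like this:
standings_2003 = clean_csv("data/raw/2003/regular_season/standings.csv")
# the kept columns, after applying the column_mappings from files_to_clean
print(standings_2003.columns.tolist())
# ['Rank', 'Team', 'OverallRecord', 'HomeRecord', 'RoadRecord', 'EastRecord', 'WestRecord']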
Now, we need to merge all the cleaned data into a single DataFrame.
For each season, we will do the following:
1. Clean all of the CSV files for the season using the clean_csv function.
2. Merge the DataFrames for the season on the Team column using an outer join.
3. Add a Season column to the beginning of the DataFrame and fill all of the rows with the same season value.
Then, we concatenate the DataFrames for each season into a single DataFrame.
# create main DataFrame
data = pd.DataFrame()
# go through seasons
for season in seasons:
    # initialize season DataFrame
    season_data = None
    # get list of CSV files for season
    infoList = list(itertools.chain(*pages_to_scrape(season).values()))
    pathList = [info["path"] for info in infoList]
    # go through CSV files
    for index, path in enumerate(pathList):
        # clean CSV file and store as DataFrame
        df = clean_csv(path)
        # merge DataFrame with season DataFrame
        if index == 0:
            season_data = df
        else:
            season_data = season_data.merge(df, on="Team", how="outer")
    # add season column at beginning
    season_data.insert(0, "Season", season)
    # add season DataFrame to main DataFrame
    data = pd.concat(objs=[data, season_data])
Now, we can look at the merged data.
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 628 entries, 0 to 29 Data columns (total 48 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Season 628 non-null int64 1 Team 628 non-null object 2 FG 628 non-null float64 3 FGA 628 non-null float64 4 FG% 628 non-null float64 5 3P 628 non-null float64 6 3PA 628 non-null float64 7 3P% 628 non-null float64 8 2P 628 non-null float64 9 2PA 628 non-null float64 10 2P% 628 non-null float64 11 FT 628 non-null float64 12 FTA 628 non-null float64 13 FT% 628 non-null float64 14 ORB 628 non-null float64 15 DRB 628 non-null float64 16 TRB 628 non-null float64 17 AST 628 non-null float64 18 STL 628 non-null float64 19 BLK 628 non-null float64 20 TOV 628 non-null float64 21 PF 628 non-null float64 22 PTS 628 non-null float64 23 SRS 628 non-null float64 24 ORtg 628 non-null float64 25 DRtg 628 non-null float64 26 NRtg 628 non-null float64 27 Pace 628 non-null float64 28 FTr 628 non-null float64 29 3PAr 628 non-null float64 30 TS% 628 non-null float64 31 eFG% 628 non-null float64 32 TOV% 628 non-null float64 33 ORB% 628 non-null float64 34 FT/FGA 628 non-null float64 35 OppeFG% 628 non-null float64 36 OppTOV% 628 non-null float64 37 DRB% 628 non-null float64 38 OppFT/FGA 628 non-null float64 39 Rank 628 non-null int64 40 OverallRecord 628 non-null object 41 HomeRecord 628 non-null object 42 RoadRecord 628 non-null object 43 EastRecord 628 non-null object 44 WestRecord 628 non-null object 45 PlayoffRank 340 non-null float64 46 PlayoffW 340 non-null float64 47 PlayoffL 340 non-null float64 dtypes: float64(40), int64(2), object(6) memory usage: 240.4+ KB
data
Season | Team | FG | FGA | FG% | 3P | 3PA | 3P% | 2P | 2PA | 2P% | FT | FTA | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | SRS | ORtg | DRtg | NRtg | Pace | FTr | 3PAr | TS% | eFG% | TOV% | ORB% | FT/FGA | OppeFG% | OppTOV% | DRB% | OppFT/FGA | Rank | OverallRecord | HomeRecord | RoadRecord | EastRecord | WestRecord | PlayoffRank | PlayoffW | PlayoffL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2003 | Dallas Mavericks | 38.5 | 85.1 | 0.453 | 7.8 | 20.3 | 0.381 | 30.8 | 64.8 | 0.475 | 18.1 | 21.9 | 0.829 | 11.1 | 31.0 | 42.1 | 22.4 | 8.1 | 5.5 | 11.6 | 21.1 | 103.0 | 7.90 | 110.7 | 102.3 | 8.4 | 92.5 | 0.257 | 0.239 | 0.543 | 0.498 | 10.9 | 25.4 | 0.213 | 0.473 | 14.8 | 70.9 | 0.221 | 1 | 60-22 | 33-8 | 27-14 | 26-4 | 34-18 | 8.0 | 10.0 | 10.0 |
1 | 2003 | Golden State Warriors | 37.3 | 84.6 | 0.441 | 5.2 | 15.1 | 0.344 | 32.1 | 69.6 | 0.462 | 22.6 | 29.0 | 0.778 | 15.7 | 31.0 | 46.7 | 20.9 | 7.2 | 6.2 | 15.8 | 21.8 | 102.4 | -0.60 | 108.3 | 109.5 | -1.2 | 94.2 | 0.343 | 0.178 | 0.526 | 0.472 | 13.9 | 35.0 | 0.267 | 0.482 | 12.2 | 67.9 | 0.220 | 19 | 38-44 | 24-17 | 14-27 | 19-11 | 19-33 | NaN | NaN | NaN |
2 | 2003 | Sacramento Kings | 39.5 | 85.2 | 0.464 | 6.0 | 15.7 | 0.381 | 33.5 | 69.5 | 0.482 | 16.7 | 22.3 | 0.746 | 11.0 | 33.5 | 44.5 | 24.8 | 9.0 | 5.6 | 14.5 | 20.3 | 101.7 | 6.68 | 105.9 | 99.1 | 6.8 | 95.4 | 0.262 | 0.184 | 0.535 | 0.499 | 13.3 | 25.6 | 0.196 | 0.446 | 13.6 | 70.6 | 0.204 | 3 | 59-23 | 35-6 | 24-17 | 23-7 | 36-16 | 3.0 | 7.0 | 5.0 |
3 | 2003 | Los Angeles Lakers | 37.7 | 83.6 | 0.451 | 5.9 | 16.7 | 0.356 | 31.8 | 66.9 | 0.475 | 19.0 | 26.0 | 0.734 | 13.1 | 31.1 | 44.3 | 23.3 | 7.8 | 5.7 | 14.5 | 22.9 | 100.4 | 2.71 | 107.2 | 104.7 | 2.5 | 92.5 | 0.311 | 0.199 | 0.528 | 0.486 | 13.3 | 30.2 | 0.228 | 0.477 | 13.4 | 72.7 | 0.241 | 6 | 50-32 | 31-10 | 19-22 | 17-13 | 33-19 | 7.0 | 6.0 | 6.0 |
4 | 2003 | Milwaukee Bucks | 37.1 | 81.3 | 0.457 | 7.1 | 18.6 | 0.383 | 30.0 | 62.7 | 0.478 | 18.1 | 23.3 | 0.776 | 10.7 | 28.9 | 39.5 | 22.2 | 7.6 | 4.2 | 12.7 | 22.2 | 99.5 | -0.24 | 108.8 | 108.6 | 0.2 | 90.4 | 0.287 | 0.229 | 0.543 | 0.500 | 12.2 | 25.6 | 0.222 | 0.494 | 13.5 | 69.9 | 0.237 | 16 | 42-40 | 25-16 | 17-24 | 32-22 | 10-18 | 11.0 | 2.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
25 | 2023 | Orlando Magic | 40.5 | 86.3 | 0.470 | 10.8 | 31.1 | 0.346 | 29.8 | 55.2 | 0.539 | 19.6 | 25.0 | 0.784 | 10.2 | 33.1 | 43.2 | 23.2 | 7.4 | 4.7 | 15.1 | 20.1 | 111.4 | -2.39 | 111.6 | 114.2 | -2.6 | 99.3 | 0.290 | 0.361 | 0.573 | 0.532 | 13.4 | 23.8 | 0.227 | 0.550 | 13.1 | 77.7 | 0.211 | 25 | 34-48 | 20-21 | 14-27 | 20-32 | 14-16 | NaN | NaN | NaN |
26 | 2023 | Charlotte Hornets | 41.3 | 90.4 | 0.457 | 10.7 | 32.5 | 0.330 | 30.5 | 57.9 | 0.528 | 17.6 | 23.6 | 0.749 | 11.0 | 33.5 | 44.5 | 25.1 | 7.7 | 5.2 | 14.2 | 20.3 | 111.0 | -5.89 | 109.2 | 115.3 | -6.1 | 100.8 | 0.261 | 0.360 | 0.550 | 0.516 | 12.3 | 23.8 | 0.195 | 0.544 | 12.5 | 75.5 | 0.211 | 27 | 27-55 | 13-28 | 14-27 | 15-37 | 12-18 | NaN | NaN | NaN |
27 | 2023 | Houston Rockets | 40.6 | 88.9 | 0.457 | 10.4 | 31.9 | 0.327 | 30.2 | 56.9 | 0.530 | 19.1 | 25.3 | 0.754 | 13.4 | 32.9 | 46.3 | 22.4 | 7.3 | 4.6 | 16.2 | 20.5 | 110.7 | -7.62 | 111.4 | 119.3 | -7.9 | 99.0 | 0.285 | 0.359 | 0.554 | 0.516 | 14.0 | 30.2 | 0.215 | 0.564 | 11.8 | 75.8 | 0.218 | 28 | 22-60 | 14-27 | 8-33 | 10-20 | 12-40 | NaN | NaN | NaN |
28 | 2023 | Detroit Pistons | 39.6 | 87.1 | 0.454 | 11.4 | 32.4 | 0.351 | 28.2 | 54.6 | 0.516 | 19.8 | 25.7 | 0.771 | 11.2 | 31.3 | 42.4 | 23.0 | 7.0 | 3.8 | 15.1 | 22.1 | 110.3 | -7.73 | 110.7 | 118.9 | -8.2 | 99.0 | 0.295 | 0.372 | 0.561 | 0.520 | 13.3 | 24.9 | 0.227 | 0.557 | 11.9 | 74.0 | 0.231 | 30 | 17-65 | 9-32 | 8-33 | 8-44 | 9-21 | NaN | NaN | NaN |
29 | 2023 | Miami Heat | 39.2 | 85.3 | 0.460 | 12.0 | 34.8 | 0.344 | 27.3 | 50.5 | 0.540 | 19.1 | 23.0 | 0.831 | 9.7 | 30.9 | 40.6 | 23.8 | 8.0 | 3.0 | 13.5 | 18.5 | 109.5 | -0.13 | 113.0 | 113.3 | -0.3 | 96.3 | 0.270 | 0.408 | 0.574 | 0.530 | 12.4 | 22.8 | 0.224 | 0.561 | 14.5 | 77.7 | 0.198 | 13 | 44-38 | 27-14 | 17-24 | 24-28 | 20-10 | 4.0 | 3.0 | 1.0 |
628 rows × 48 columns
We want to analyze teams' level of success in the playoffs, so it only makes sense to consider teams that made the playoffs in our analysis.
Thus, we will drop rows where the PlayoffRank, PlayoffW, or PlayoffL columns are NaN.
We will also drop rows where the sum of PlayoffW and PlayoffL is $0$ (which would indicate that a team made it to the play-in tournament, but did not advance to the playoffs).
data = data.dropna(subset=["PlayoffRank", "PlayoffW", "PlayoffL"])
data = data[(data["PlayoffW"] != 0) | (data["PlayoffL"] != 0)]
data = data.reset_index(drop=True)
As we can see, most of the columns in the DataFrame have the correct dtypes, but there are a few we need to correct. Specifically, PlayoffRank, PlayoffW, and PlayoffL need to be changed from float64 to int64.
data = data.astype({"PlayoffRank": "int64", "PlayoffW": "int64", "PlayoffL": "int64"})
Now, using our existing data, we want to add a few new columns to our DataFrame that will be useful in analysis.
One important column the data is missing is Conference. It will be useful to know which conference a team is in, because that determines playoff matchups.
The EastRecord and WestRecord columns are in the form W-L (where W is the number of wins and L is the number of losses), indicating a team's record against opponents in each conference. We can calculate the number of games a team played in each conference using the formula below:
$$ \text{games} = \text{wins} + \text{losses} $$
Then, we can determine a team's Conference by checking if the team played more games in the East or the West. For example, the $2003$ Dallas Mavericks went $26\text{-}4$ vs. the East and $34\text{-}18$ vs. the West, so they played $30$ games in the East and $52$ in the West, making them a Western Conference team.
def get_conference(row):
    east_wins, east_losses = row["EastRecord"].split("-")
    west_wins, west_losses = row["WestRecord"].split("-")
    east_games = int(east_wins) + int(east_losses)
    west_games = int(west_wins) + int(west_losses)
    return "East" if east_games > west_games else "West"
data.insert(2, "Conference", data.apply(get_conference, axis=1))
Another important column the data is missing is Seed. This is likely to be important, because playoff seeding determines the matchups. As you can imagine, the $1^\text{st}$ seed is typically expected to win more games than the $8^\text{th}$ seed.
We can determine the playoff seeding by doing the following:
1. Group the data by Season and then Conference.
2. Rank the teams by Rank within each group.
This should give us a Seed between $1$ (referred to as the highest seed) and $8$ (referred to as the lowest seed).
data.insert(
3,
"Seed",
data.groupby(["Season", "Conference"])["Rank"]
.rank(method="dense", ascending=True)
.astype("int64"),
)
Currently, we have $5$ columns that have team records in the form W-L (where W is the number of wins and L is the number of losses): OverallRecord, HomeRecord, RoadRecord, EastRecord, and WestRecord.
We want to standardize these columns by turning them into winning percentages using the formula below:
$$ \text{winning percentage} = \frac{\text{wins}}{\text{wins} + \text{losses}} $$
This will give us $5$ new columns to replace the existing ones: W%, HomeW%, RoadW%, EastW%, and WestW%.
def record_to_win_pct(record):
    wins, losses = record.split("-")
    return int(wins) / (int(wins) + int(losses))

for (win_pct, record) in [
    ("W%", "OverallRecord"),
    ("HomeW%", "HomeRecord"),
    ("RoadW%", "RoadRecord"),
    ("EastW%", "EastRecord"),
    ("WestW%", "WestRecord"),
]:
    data[record] = data[record].apply(record_to_win_pct)
    data = data.rename(columns={record: win_pct})
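As a quick sanity check (a sketch, reusing the helper above), a $60\text{-}22$ record gives the $0.731707$ value we saw for the $2003$ Dallas Mavericks:
print(record_to_win_pct("60-22"))  # 0.7317073170731707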
Now that we know each team's Conference as well as its winning percentage against teams from each conference, let's add a column named ConferenceW%, which is a team's winning percentage against teams in its own conference.
data.insert(
len(data.columns) - 3,
"ConferenceW%",
data.apply(
lambda row: row["EastW%"] if row["Conference"] == "East" else row["WestW%"],
axis=1,
),
)
Since there are so many columns in our DataFrame, it can be difficult to see what each one means. For this reason, we will create a dictionary named data_labels mapping each column name to a label describing it. We can use these labels later on (e.g. in our plots).
data_labels = {
"Season": "Season",
"Team": "Team",
"Conference": "Conference",
"Seed": "Seed",
"FG": "Field Goals (FG)",
"FGA": "Field Goal Attempts (FGA)",
"FG%": "Field Goal Percentage (FG%)",
"3P": "3-Point Field Goals (3P)",
"3PA": "3-Point Field Goal Attempts (3PA)",
"3P%": "3-Point Field Goal Percentage (3P%)",
"2P": "2-Point Field Goals (2P)",
"2PA": "2-Point Field Goal Attempts (2PA)",
"2P%": "2-Point Field Goal Percentage (2P%)",
"FT": "Free Throws (FT)",
"FTA": "Free Throw Attempts (FTA)",
"FT%": "Free Throw Percentage (FT%)",
"ORB": "Offensive Rebounds (ORB)",
"DRB": "Defensive Rebounds (DRB)",
"TRB": "Total Rebounds (TRB)",
"AST": "Assists (AST)",
"STL": "Steals (STL)",
"BLK": "Blocks (BLK)",
"TOV": "Turnovers (TOV)",
"PF": "Personal Fouls (PF)",
"PTS": "Points (PTS)",
"SRS": "Simple Rating System (SRS)",
"ORtg": "Offensive Rating (ORtg)",
"DRtg": "Defensive Rating (DRtg)",
"NRtg": "Net Rating (NRtg)",
"Pace": "Pace Factor (Pace)",
"FTr": "Free Throw Attempt Rate (FTr)",
"3PAr": "3-Point Attempt Rate (3PAr)",
"TS%": "True Shooting Percentage (TS%)",
"eFG%": "Effective Field Goal Percentage (eFG%)",
"TOV%": "Turnover Percentage (TOV%)",
"ORB%": "Offensive Rebound Percentage (ORB%)",
"FT/FGA": "Free Throws Per Field Goal Attempt (FT/FGA)",
"OppeFG%": "Opponent Effective Field Goal Percentage (OppeFG%)",
"OppTOV%": "Opponent Turnover Percentage (OppTOV%)",
"DRB%": "Defensive Rebound Percentage (DRB%)",
"OppFT/FGA": "Opponent Free Throws Per Field Goal Attempt (OppFT/FGA)",
"Rank": "Rank",
"W%": "Winning Percentage",
"HomeW%": "Home Winning Percentage",
"RoadW%": "Road Winning Percentage",
"EastW%": "East Winning Percentage",
"WestW%": "West Winning Percentage",
"ConferenceW%": "Conference Winning Percentage",
"PlayoffRank": "Playoff Rank",
"PlayoffW": "Playoff Wins",
"PlayoffL": "Playoff Losses",
}
Now, our data should be nice and tidy, making it suitable for analysis. Let's take a quick look before moving on.
data
Season | Team | Conference | Seed | FG | FGA | FG% | 3P | 3PA | 3P% | 2P | 2PA | 2P% | FT | FTA | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | SRS | ORtg | DRtg | NRtg | Pace | FTr | 3PAr | TS% | eFG% | TOV% | ORB% | FT/FGA | OppeFG% | OppTOV% | DRB% | OppFT/FGA | Rank | W% | HomeW% | RoadW% | EastW% | WestW% | ConferenceW% | PlayoffRank | PlayoffW | PlayoffL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2003 | Dallas Mavericks | West | 1 | 38.5 | 85.1 | 0.453 | 7.8 | 20.3 | 0.381 | 30.8 | 64.8 | 0.475 | 18.1 | 21.9 | 0.829 | 11.1 | 31.0 | 42.1 | 22.4 | 8.1 | 5.5 | 11.6 | 21.1 | 103.0 | 7.90 | 110.7 | 102.3 | 8.4 | 92.5 | 0.257 | 0.239 | 0.543 | 0.498 | 10.9 | 25.4 | 0.213 | 0.473 | 14.8 | 70.9 | 0.221 | 1 | 0.731707 | 0.804878 | 0.658537 | 0.866667 | 0.653846 | 0.653846 | 8 | 10 | 10 |
1 | 2003 | Sacramento Kings | West | 3 | 39.5 | 85.2 | 0.464 | 6.0 | 15.7 | 0.381 | 33.5 | 69.5 | 0.482 | 16.7 | 22.3 | 0.746 | 11.0 | 33.5 | 44.5 | 24.8 | 9.0 | 5.6 | 14.5 | 20.3 | 101.7 | 6.68 | 105.9 | 99.1 | 6.8 | 95.4 | 0.262 | 0.184 | 0.535 | 0.499 | 13.3 | 25.6 | 0.196 | 0.446 | 13.6 | 70.6 | 0.204 | 3 | 0.719512 | 0.853659 | 0.585366 | 0.766667 | 0.692308 | 0.692308 | 3 | 7 | 5 |
2 | 2003 | Los Angeles Lakers | West | 5 | 37.7 | 83.6 | 0.451 | 5.9 | 16.7 | 0.356 | 31.8 | 66.9 | 0.475 | 19.0 | 26.0 | 0.734 | 13.1 | 31.1 | 44.3 | 23.3 | 7.8 | 5.7 | 14.5 | 22.9 | 100.4 | 2.71 | 107.2 | 104.7 | 2.5 | 92.5 | 0.311 | 0.199 | 0.528 | 0.486 | 13.3 | 30.2 | 0.228 | 0.477 | 13.4 | 72.7 | 0.241 | 6 | 0.609756 | 0.756098 | 0.463415 | 0.566667 | 0.634615 | 0.634615 | 7 | 6 | 6 |
3 | 2003 | Milwaukee Bucks | East | 7 | 37.1 | 81.3 | 0.457 | 7.1 | 18.6 | 0.383 | 30.0 | 62.7 | 0.478 | 18.1 | 23.3 | 0.776 | 10.7 | 28.9 | 39.5 | 22.2 | 7.6 | 4.2 | 12.7 | 22.2 | 99.5 | -0.24 | 108.8 | 108.6 | 0.2 | 90.4 | 0.287 | 0.229 | 0.543 | 0.500 | 12.2 | 25.6 | 0.222 | 0.494 | 13.5 | 69.9 | 0.237 | 16 | 0.512195 | 0.609756 | 0.414634 | 0.592593 | 0.357143 | 0.592593 | 11 | 2 | 4 |
4 | 2003 | Orlando Magic | East | 8 | 35.9 | 82.5 | 0.436 | 6.9 | 19.4 | 0.357 | 29.0 | 63.1 | 0.460 | 19.7 | 25.4 | 0.777 | 11.7 | 29.2 | 40.9 | 20.4 | 8.5 | 3.7 | 14.4 | 23.0 | 98.5 | -0.39 | 105.2 | 105.0 | 0.2 | 93.1 | 0.307 | 0.235 | 0.526 | 0.478 | 13.3 | 27.0 | 0.239 | 0.486 | 15.1 | 71.1 | 0.250 | 17 | 0.512195 | 0.634146 | 0.390244 | 0.574074 | 0.392857 | 0.574074 | 15 | 3 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
331 | 2023 | Phoenix Suns | West | 4 | 42.1 | 90.1 | 0.467 | 12.2 | 32.6 | 0.374 | 29.9 | 57.5 | 0.520 | 17.2 | 21.7 | 0.793 | 11.8 | 32.4 | 44.2 | 27.3 | 7.1 | 5.3 | 13.5 | 21.2 | 113.6 | 2.08 | 115.1 | 113.0 | 2.1 | 98.2 | 0.241 | 0.362 | 0.570 | 0.535 | 12.0 | 26.6 | 0.191 | 0.532 | 12.9 | 76.0 | 0.234 | 10 | 0.548780 | 0.682927 | 0.414634 | 0.500000 | 0.576923 | 0.576923 | 3 | 4 | 1 |
332 | 2023 | Los Angeles Clippers | West | 6 | 41.1 | 86.1 | 0.477 | 12.7 | 33.4 | 0.381 | 28.4 | 52.7 | 0.539 | 18.7 | 23.9 | 0.781 | 9.8 | 33.4 | 43.2 | 23.9 | 7.1 | 4.4 | 14.2 | 19.5 | 113.6 | 0.31 | 115.0 | 114.5 | 0.5 | 98.0 | 0.278 | 0.387 | 0.588 | 0.551 | 12.8 | 22.9 | 0.217 | 0.543 | 11.7 | 76.6 | 0.195 | 12 | 0.536585 | 0.560976 | 0.512195 | 0.566667 | 0.519231 | 0.519231 | 14 | 1 | 4 |
333 | 2023 | Brooklyn Nets | East | 6 | 41.5 | 85.1 | 0.487 | 12.8 | 33.8 | 0.378 | 28.7 | 51.3 | 0.559 | 17.7 | 22.1 | 0.800 | 8.2 | 32.3 | 40.5 | 25.5 | 7.1 | 6.2 | 13.7 | 21.1 | 113.4 | 1.03 | 115.0 | 114.1 | 0.9 | 98.3 | 0.260 | 0.397 | 0.598 | 0.562 | 12.7 | 19.6 | 0.208 | 0.530 | 12.2 | 73.7 | 0.212 | 9 | 0.548780 | 0.560976 | 0.536585 | 0.576923 | 0.500000 | 0.576923 | 16 | 0 | 4 |
334 | 2023 | Cleveland Cavaliers | East | 4 | 41.6 | 85.2 | 0.488 | 11.6 | 31.6 | 0.367 | 30.0 | 53.6 | 0.559 | 17.5 | 22.5 | 0.780 | 9.7 | 31.4 | 41.1 | 24.9 | 7.1 | 4.7 | 13.3 | 19.0 | 112.3 | 5.23 | 116.1 | 110.6 | 5.5 | 95.7 | 0.264 | 0.371 | 0.590 | 0.556 | 12.3 | 23.6 | 0.206 | 0.535 | 14.4 | 76.3 | 0.210 | 5 | 0.621951 | 0.756098 | 0.487805 | 0.653846 | 0.566667 | 0.653846 | 10 | 1 | 3 |
335 | 2023 | Miami Heat | East | 7 | 39.2 | 85.3 | 0.460 | 12.0 | 34.8 | 0.344 | 27.3 | 50.5 | 0.540 | 19.1 | 23.0 | 0.831 | 9.7 | 30.9 | 40.6 | 23.8 | 8.0 | 3.0 | 13.5 | 18.5 | 109.5 | -0.13 | 113.0 | 113.3 | -0.3 | 96.3 | 0.270 | 0.408 | 0.574 | 0.530 | 12.4 | 22.8 | 0.224 | 0.561 | 14.5 | 77.7 | 0.198 | 13 | 0.536585 | 0.658537 | 0.414634 | 0.461538 | 0.666667 | 0.461538 | 4 | 3 | 1 |
336 rows × 51 columns
Now, it is time to explore some of the trends in this data through data visualization. We will be making extensive use of Plotly for data visualization.
Before we start creating visualizations, there are a few things we need to set up.
Since the $2023$ playoffs are still ongoing, let's split our data into $2$ DataFrames: past_data and current_data. We will only use past_data for now.
past_data = data[data["Season"] < 2023]
current_data = data[data["Season"] == 2023]
Since there are lots of columns in our DataFrame, we will be creating lots of graphs that make use of subplots. For convenience, we will define a function named get_subplots.
It takes in:
- df: a DataFrame with our data
- labels: a dictionary mapping column names to descriptions to use as labels
- X_names: a list of columns we want to use as the $x$-axis of a subplot
- y_name: the name of the column we want to use as the $y$-axis for every subplot
- rows: the number of rows in the overall figure
- cols: the number of columns in the overall figure
It will return a $\text{rows} \times \text{cols}$ figure with plots of y_name vs. x_name for every x_name in X_names using the data in df.
We will also add a regression line to each subplot using the linear regression (ordinary least squares) model from scikit-learn.
def get_subplots(df, labels, X_names, y_name, rows, cols):
    # create subplot titles
    titles = []
    width = 30
    for x_name in X_names:
        title = f"{labels[y_name]} vs. {labels[x_name]}"
        title = "<br>".join(textwrap.wrap(title, width=width))
        titles.append(title)
    # create figure with subplots
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=titles)
    # go through each subplot
    for (index, x_name) in enumerate(X_names):
        # get row and column of subplot
        row = index // cols + 1
        col = index % cols + 1
        # create hovertemplate for subplot
        hovertemplate = "<br>".join(
            [
                "%{xaxis.title.text}=%{x}",
                "%{yaxis.title.text}=%{y}",
                "Team=%{customdata[0]}",
                "Season=%{customdata[1]}",
            ]
        )
        # create subplot
        subplot = go.Scatter(
            x=df[x_name],
            y=df[y_name],
            mode="markers",
            customdata=df[["Team", "Season"]],
            hovertemplate=hovertemplate,
            name="",
            showlegend=False,
        )
        # add subplot to figure
        fig.add_trace(subplot, row, col)
        fig.update_xaxes(title_text=labels[x_name], row=row, col=col)
        fig.update_yaxes(title_text=labels[y_name], row=row, col=col)
        # compute regression line for subplot
        model = LinearRegression()
        X = df[[x_name]].values
        y = df[y_name]
        model.fit(X, y)
        equation = f"y = {model.intercept_:.2f} + {model.coef_[0]:.2f}x"
        score = model.score(X, y)
        x_range = np.linspace(df[x_name].min(), df[x_name].max(), 100)
        y_range = model.predict(x_range.reshape(-1, 1))
        # create regression line for subplot
        line = go.Scatter(
            x=x_range,
            y=y_range,
            mode="lines",
            hovertemplate=f"{equation}<br>r^2 = {score}",
            name="Regression Line",
            showlegend=False,
            line={"color": "black"},
        )
        # add regression line for subplot to figure
        fig.add_trace(line, row, col)
    # update figure layout
    fig.update_layout(height=rows * 400)
    # return figure
    return fig
As we can see, there are quite a lot of statistics in the DataFrame, so we will group them into the following categories: overall rating statistics, volume-based offensive statistics, efficiency-based offensive statistics, rebounding statistics, passing and turnover statistics, and defensive statistics.
Our goal is to see which statistics in each category are most correlated with playoff wins. Are there any statistics that individually work as decent predictors of playoff wins (even with just a simple linear regression model)? Or do we need to look at statistics together to make any sense of the data?
We will be making scatter plots of playoff wins vs. each statistic in each category, with regression lines added. This is just meant for preliminary analysis, as linear regression is probably overly simplistic when looking at most of these statistics. However, these plots might give us some helpful intuition for understanding the data.
We'll start with some statistics that provide an overall rating of a team:
- Rank: ranking within the league, by win-loss record
- Seed: placement in the playoff bracket (ranking within conference, by win-loss record)
- Simple Rating System (SRS): team rating that takes into account average point differential and strength of schedule
- Net Rating (NRtg): point differential (difference between points produced and allowed) per $100$ possessions
- Winning Percentage (W%): percentage of games won
- Home Winning Percentage (HomeW%): percentage of games won in home games
- Road Winning Percentage (RoadW%): percentage of games won in road games
- East Winning Percentage (EastW%): percentage of games won vs. Eastern Conference teams
- West Winning Percentage (WestW%): percentage of games won vs. Western Conference teams
- Conference Winning Percentage (ConferenceW%): percentage of games won vs. teams within conference
overall_rating_stats = [
"Rank",
"Seed",
"SRS",
"NRtg",
"W%",
"HomeW%",
"RoadW%",
"EastW%",
"WestW%",
"ConferenceW%",
]
fig = get_subplots(past_data, data_labels, overall_rating_stats, "PlayoffW", 4, 3)
fig.show()
As we can see from the scatter plots, all of these statistics show some correlation with playoff wins (even though the relation might not be linear).
In particular, the following statistics seem like the best predictors of playoff wins:
- Rank
- Seed
- Net Rating (NRtg)
- Winning Percentage (W%)
- Conference Winning Percentage (ConferenceW%)
It's particularly interesting that conference winning percentage seems to be more tightly correlated with playoff wins than home/road/East/West winning percentage.
Now, we'll look at some offensive statistics that are volume-based (i.e. based on the number of shots made):
- Field Goals (FG): number of $3$-point and $2$-point shots made
- 3-Point Field Goals (3P): number of $3$-point shots (taken outside the $3$-point line) made
- 2-Point Field Goals (2P): number of $2$-point shots (taken inside the $3$-point line) made
- Free Throws (FT): number of free throws (penalty shots awarded after fouls) made
- Points (PTS): number of points scored ($1$ point per free throw, $2$ points per $2$-point field goal, and $3$ points per $3$-point field goal)
offensive_volume_stats = ["FG", "3P", "2P", "FT", "PTS"]
fig = get_subplots(past_data, data_labels, offensive_volume_stats, "PlayoffW", 2, 3)
fig.show()
Based on the scatter plots, there appears to be little to no correlation between any of these statistics and playoff wins. This is not very surprising, considering that scoring has high variability.
Perhaps offensive volume statistics will only be useful when paired with offensive efficiency statistics.
Now, we'll look at some offensive statistics that are efficiency-based (i.e. based on points per possession or shot attempt):
- Field Goal Percentage (FG%): ratio of field goals made to field goals attempted
- 3-Point Field Goal Percentage (3P%): ratio of $3$-point field goals made to $3$-point field goals attempted
- 2-Point Field Goal Percentage (2P%): ratio of $2$-point field goals made to $2$-point field goals attempted
- Free Throw Percentage (FT%): ratio of free throws made to free throws attempted
- True Shooting Percentage (TS%): measure of shooting efficiency that accounts for all $3$ methods of scoring (free throws, $2$-point field goals, and $3$-point field goals)
- Effective Field Goal Percentage (eFG%): measure of shooting efficiency that accounts for $3$-point field goals being worth more than $2$-point field goals
- Free Throw Attempt Rate (FTr): number of free throw attempts per field goal attempt (measures team's ability to draw fouls and get to the free throw line)
- 3-Point Attempt Rate (3PAr): number of $3$-point field goal attempts per field goal attempt (measures team's ability to score from deep, which has become more important during the $3$-point revolution)
- Free Throws Per Field Goal Attempt (FT/FGA): number of free throws made per field goal attempt (measures team's ability to get to the free throw line and make free throws)
- Pace Factor (Pace): offensive possessions per game
- Offensive Rating (ORtg): points produced per $100$ possessions
offensive_efficiency_stats = [
"FG%",
"3P%",
"2P%",
"FT%",
"TS%",
"eFG%",
"FTr",
"3PAr",
"FT/FGA",
"Pace",
"ORtg",
]
fig = get_subplots(past_data, data_labels, offensive_efficiency_stats, "PlayoffW", 4, 3)
fig.show()
As we can see from the scatter plots, a few of these statistics show some correlation with playoff wins (even though the relation might not be linear).
In particular, the following statistics seem like the best predictors of playoff wins:
- True Shooting Percentage (TS%)
- Effective Field Goal Percentage (eFG%)
- Offensive Rating (ORtg)
However, none of these statistics seem like good predictors of playoff wins on their own (since the scatter plots show very weak correlation). Perhaps offensive efficiency statistics will only be useful when paired with offensive volume statistics.
Now, we'll look at some rebounding statistics:
- Total Rebounds (TRB): number of rebounds on offense and defense
- Offensive Rebounds (ORB): number of rebounds on offense
- Defensive Rebounds (DRB): number of rebounds on defense
- Offensive Rebound Percentage (ORB%): percentage of available offensive rebounds the team got
- Defensive Rebound Percentage (DRB%): percentage of available defensive rebounds the team got
rebounding_stats = ["TRB", "ORB", "DRB", "ORB%", "DRB%"]
fig = get_subplots(past_data, data_labels, rebounding_stats, "PlayoffW", 2, 3)
fig.show()
Based on the scatter plots, there appears to be little to no correlation between any of these statistics and playoff wins. This is not very surprising, considering that rebounding has high variability.
Perhaps rebounding statistics will only be useful when paired with other statistics.
Now, we'll look at some statistics pertaining to passing and turnovers:
- Assists (AST): number of passes which lead to field goals
- Turnovers (TOV): number of times team loses possession of the ball
- Turnover Percentage (TOV%): turnovers per $100$ possessions
passing_turnovers_stats = ["AST", "TOV", "TOV%"]
fig = get_subplots(past_data, data_labels, passing_turnovers_stats, "PlayoffW", 1, 3)
fig.show()
Based on the scatter plots, most of the statistics appear to show little to no correlation with playoff wins. In fact, only one appears to show any significant correlation with playoff wins (even though the relation might not be linear): assists (AST).
Even for assists, the correlation appears to be very weak.
Perhaps passing and turnover statistics will only be useful when paired with other statistics.
Finally, we'll look at some defensive statistics:
- Steals (STL): number of times team's defense takes/intercepts the ball from the opposing team's offense
- Blocks (BLK): number of times team's defense deflects shot attempts from the opposing team's offense
- Personal Fouls (PF): number of times team is penalized for illegal physical contact (which can lead to loss of possession and/or free throws for the opposing team)
- Opponent Effective Field Goal Percentage (OppeFG%): opponent's effective field goal percentage (measure of shooting efficiency that accounts for $3$-point field goals being worth more than $2$-point field goals)
- Opponent Turnover Percentage (OppTOV%): opponent's turnovers per $100$ possessions
- Opponent Free Throws Per Field Goal Attempt (OppFT/FGA): opponent's free throws made per field goal attempt (measures opponent's ability to get to the free throw line and make free throws)
- Defensive Rating (DRtg): points allowed per $100$ possessions
defense_stats = ["STL", "BLK", "PF", "OppeFG%", "OppTOV%", "OppFT/FGA", "DRtg"]
fig = get_subplots(past_data, data_labels, defense_stats, "PlayoffW", 3, 3)
fig.show()
As we can see from the scatter plots, only a few of these statistics show any significant correlation with playoff wins (even though the relation might not be linear).
In particular, the following statistics seem like the best predictors of playoff wins:
- Opponent Effective Field Goal Percentage (OppeFG%)
- Defensive Rating (DRtg)
However, neither of these statistics seems like a good predictor of playoff wins on its own (since the scatter plots show very weak correlation). Perhaps defensive statistics will only be useful when paired with offensive statistics.
These scatter plots showed us that it is hard to correlate individual statistics with playoff wins, especially with just a linear model. There seems to be too much variability in individual statistics, and most individual statistics don't give you the full picture.
Even for statistics which did show correlation with playoff wins (e.g. the overall rating statistics), the correlation wasn't very strong.
For this reason, we will now create visualizations which combine multiple statistics, and see if they provide more insight into the factors underlying playoff success.
One form of visualization which has become popular is called The Efficiency Landscape, developed by Kirk Goldsberry, an NBA analyst at ESPN.
It takes the form of a scatter plot with $4$ quadrants, depicting defensive efficiency vs. offensive efficiency. See the picture below for an example:
The idea is that the median offensive and defensive efficiency lines divide teams into $4$ quadrants:
- teams in the $1^\text{st}$ quadrant are above the median in both offensive and defensive efficiency
- teams in the $2^\text{nd}$ and $4^\text{th}$ quadrants are above the median in only one of the two
- teams in the $3^\text{rd}$ quadrant are below the median in both
Thus, you would expect that teams in the $1^\text{st}$ quadrant are the best, teams in the $2^\text{nd}$ and $4^\text{th}$ quadrants are worse, and teams in the $3^\text{rd}$ quadrant are the worst.
We will adapt this visualization to see if it serves as a good way to classify teams' performance in the playoffs.
We will make a bubble plot of defensive rating (DRtg) vs. offensive rating (ORtg), where the size of the bubble indicates the number of playoff wins.
Considering the NBA's scoring explosion, we will make a separate plot for each season so that the league's rise in scoring efficiency over time does not skew our results (as a confounding variable).
cols = 4
rows = np.ceil((len(seasons) - 1) / cols).astype(int)
fig = px.scatter(
past_data,
x="ORtg",
y="DRtg",
size="PlayoffW",
color="Team",
facet_col="Season",
title=f"{data_labels['DRtg']} vs. {data_labels['ORtg']} by {data_labels['Season']}",
labels=data_labels,
facet_col_wrap=cols,
height=1200,
facet_col_spacing=0.05,
)
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
None
Now, we will add lines to each plot indicating the median offensive rating (ORtg) and median defensive rating (DRtg) for each season. These lines create our quadrants.
for index, season in enumerate(seasons[:-1]):
    row = rows - (index // cols)
    col = index % cols + 1
    season_data = past_data[past_data["Season"] == season]
    median_ORtg = season_data["ORtg"].median()
    median_DRtg = season_data["DRtg"].median()
    fig.add_vline(x=median_ORtg, line_width=2, line_color="black", row=row, col=col)
    fig.add_hline(y=median_DRtg, line_width=2, line_color="black", row=row, col=col)
Finally, we can look at the plots.
fig.show()
We can see that, in most seasons, the largest bubbles (the teams with the most playoff wins) tend to sit in the quadrant with good offensive and good defensive efficiency.
This lends credence to the idea that having good offensive efficiency and good defensive efficiency can serve as a good predictor of playoff success.
Note, though, that this trend is not perfect. For example, in $2022$, the Golden State Warriors won the NBA Championship despite being in the bottom-left corner of the $3^\text{rd}$ quadrant, which indicates bad offensive and defensive efficiency.
Perhaps, though, this relation can serve as one of many important factors in our models later on.
Now, let's combine two of the statistics most commonly used to predict playoff success: seed (Seed) and true shooting percentage (TS%).
We already saw that individually, each of these statistics does show some correlation with playoff wins, but the correlation is not very strong.
Can we get a better prediction by combining these statistics?
That is what we will do below, by creating a density heatmap with seed on the $x$-axis and true shooting percentage (TS%) on the $y$-axis. The color of each tile in the heatmap is determined by computing the average playoff wins (PlayoffW) for teams with the corresponding seed and true shooting percentage.
fig = px.density_heatmap(
past_data,
x="Seed",
y="TS%",
z="PlayoffW",
histfunc="avg",
title=f"Average {data_labels['PlayoffW']} by {data_labels['Seed']} and {data_labels['TS%']}",
labels={
**data_labels,
f"avg of {data_labels['PlayoffW']}": f"Average {data_labels['PlayoffW']}",
},
nbinsx=8,
text_auto=".0f",
)
hovertemplate = "<br>".join(
[
"%{xaxis.title.text}=%{x}",
"%{yaxis.title.text}=%{y}",
f"Average {data_labels['PlayoffW']}=%{{z:.2f}}",
]
)
fig.update_layout(coloraxis={"colorbar": {"title": "Average Playoff Wins"}})
fig.update_traces(hovertemplate=hovertemplate)
fig.show()
As we can see from the heatmap, looking at seed and true shooting percentage together is quite insightful.
We can see that teams towards the left of the heatmap (i.e. those with higher seeds) and teams towards the top of the heatmap (i.e. those with higher true shooting percentages) tend to have more playoff wins.
For example, almost all of the teams which average more than $8$ playoff wins (i.e. make the conference finals) are either $1^\text{st}$ or $2^\text{nd}$ seeds. The only exceptions are $3^\text{rd}$ and $4^\text{th}$ seeds with true shooting percentage over $58\%$.
This shows that seed and true shooting percentage balance each other out to some degree. Higher-seeded teams can succeed in the playoffs despite poor shooting efficiency, and highly efficient teams can succeed in the playoffs despite low seeding.
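As a quick numeric companion to the heatmap (a sketch using the past_data frame defined above), we can collapse the shooting axis and look at average playoff wins by seed alone:
# average playoff wins for each seed, across all past seasons
print(past_data.groupby("Seed")["PlayoffW"].mean())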
Pairing other statistics together in this fashion would likely yield similar results, so it will be important to consider factors together when we build our models.
Now that we have explored some of the trends in the data, we will begin the stages of analysis, hypothesis testing, and machine learning. We will be making extensive use of scikit-learn for machine learning.
Let's start off by defining the problem we are trying to solve. As we stated in the introduction, our goal is to predict the outcome of the NBA playoffs using data from the regular season.
First, let's take a look at the variables we are using to make our predictions.
X_names = past_data.columns.tolist()[:-3]
We have the following categorical variables:
X_categorical_names = X_names[:3]
X_categorical_names
['Season', 'Team', 'Conference']
We have the following numerical variables:
X_numerical_names = X_names[3:]
X_numerical_names
['Seed', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OppeFG%', 'OppTOV%', 'DRB%', 'OppFT/FGA', 'Rank', 'W%', 'HomeW%', 'RoadW%', 'EastW%', 'WestW%', 'ConferenceW%']
Now, let's take a look at the variable we are trying to predict:
y_name = past_data.columns.tolist()[-2]
print(y_name)
PlayoffW
Since playoff wins (PlayoffW) is a numerical variable, this can be classified as a regression problem.
Before we can start solving this regression problem, we need to split the data into random train and test subsets.
We can do this using a scikit-learn utility (train_test_split). We will specify a test_size of $0.2$, so $80\%$ of the data will be included in the train split and $20\%$ of the data will be included in the test split. We will also specify a random_state of $0$ to allow for reproducible results.
X = past_data[X_names]
y = past_data[y_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
Now, we will build machine learning pipelines to solve this regression problem.
The first step in our machine learning pipelines will be to preprocess the data.
Since there are no missing values in our data, there is no need for imputation.
For our categorical variables (Season, Team, and Conference), we need to encode the values. Specifically, we will use the OneHotEncoder (docs), which turns a categorical variable with $n$ values into an $n$-bit vector.
categorical_transformer = Pipeline(
steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)
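To make the encoding concrete, here is a minimal sketch (with a hypothetical toy column) of what OneHotEncoder does to a variable like Conference:
toy_conference = pd.DataFrame({"Conference": ["East", "West", "East"]})
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(toy_conference)
# "West" becomes the 2-bit vector [0, 1] (categories are sorted: East, West)
print(encoder.transform(pd.DataFrame({"Conference": ["West"]})).toarray())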
For our numerical variables, we need to scale the values. There are a few different scalers that might work, so we can try all of them and see which works best:
- StandardScaler (docs), which standardizes each variable into $Z$-scores
- MinMaxScaler (docs), which scales each variable to a given range
- MaxAbsScaler (docs), which scales each variable by its maximum absolute value
- RobustScaler (docs), which scales each variable using statistics that are robust to outliers
numerical_transformer = Pipeline(steps=[("scaler", None)])
scalers = [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler()]
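For reference, the first of these is just the familiar $Z$-score transformation, applied per variable:
$$ z = \frac{x - \mu}{\sigma} $$
where $\mu$ and $\sigma$ are the variable's mean and standard deviation. The other scalers differ only in which statistics they scale by (range, maximum absolute value, or median and interquartile range).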
Now, we can define our preprocessor step using the ColumnTransformer (docs), which applies the categorical_transformer to our categorical variables and the numerical_transformer to our numerical variables.
preprocessor = ColumnTransformer(
transformers=[
("categorical", categorical_transformer, X_categorical_names),
("numerical", numerical_transformer, X_numerical_names),
]
)
preprocessor_params = {"preprocessor__numerical__scaler": scalers}
The second step in our machine learning pipelines will be feature selection / dimensionality reduction. We will use RFECV (docs), which selects features using recursive feature elimination with cross-validation.
We will make our feature_selector a function of the model being used. Thus, when performing RFECV, we will use the specified model as the estimator. RFECV recursively removes the feature ranked least important by the estimator until we have the desired number of features.
This can help us prevent overfitting, so that our model generalizes well to new data and doesn't only perform well on the training data.
Different cross-validation splitting strategies could work, so we will try $3$-, $5$-, and $10$-fold cross-validation and see what works best.
def feature_selector(model):
    return RFECV(model)

feature_selector_params = {
    "feature_selector__cv": [3, 5, 10],
}
The third step in our machine learning pipelines is model selection / hyperparameter tuning.
There are a lot of different models we could use to solve this regression problem, so we will try some of them and see what works best. We will also specify different hyperparameters to try out in the learning process.
These are the models we will try:
- Linear Regression (LinearRegression): ordinary least squares linear regression (minimizes the residual sum of squares)
- Ridge Regression (Ridge): least squares linear regression with $L_2$ regularization
- Lasso Regression (Lasso): linear regression with $L_1$ regularization
- Elastic Net Regression (ElasticNet): linear regression with combined $L_1$ and $L_2$ regularization
- Decision Tree Regression (DecisionTreeRegressor): models the data as a tree of decision rules
- Random Forest Regression (RandomForestRegressor): fits decision trees on random subsamples of the data and averages the results
- Gradient Boosting Regression (GradientBoostingRegressor): additive model where a decision tree is fit on the negative gradient of the loss function in each stage
- AdaBoost Regression (AdaBoostRegressor): repeatedly fits a decision tree on the same data, with weights being adjusted based on the error of the previous prediction
models = {
"Linear Regression": {
"model": LinearRegression(),
"params": {},
},
"Ridge Regression": {
"model": Ridge(),
"params": {"model__alpha": np.logspace(-4, 4, 9)},
},
"Lasso Regression": {
"model": Lasso(),
"params": {"model__alpha": np.logspace(-4, 4, 9)},
},
"Elastic Net Regression": {
"model": ElasticNet(),
"params": {
"model__alpha": np.logspace(-4, 4, 9),
"model__l1_ratio": np.linspace(0, 1, 11),
},
},
"Decision Tree Regression": {
"model": DecisionTreeRegressor(),
"params": {
"model__criterion": [
"squared_error",
"friedman_mse",
"absolute_error",
"poisson",
],
"model__max_depth": [3, 5, 10, 15, 20, 25],
"model__min_samples_split": [2, 5, 10, 15],
},
},
"Random Forest Regression": {
"model": RandomForestRegressor(),
"params": {
"model__n_jobs": [-1],
"model__n_estimators": [10, 50, 100, 300, 500],
"model__max_depth": [3, 5, 10, 15, 20, 25],
"model__min_samples_split": [2, 5, 10, 15],
},
},
"Gradient Boosting Regression": {
"model": GradientBoostingRegressor(),
"params": {
"model__learning_rate": np.logspace(-2, 0, 3),
"model__n_estimators": [10, 50, 100, 300, 500],
"model__max_depth": [3, 5, 10, 15, 20, 25],
"model__min_samples_split": [2, 5, 10, 15],
},
},
"AdaBoost Regression": {
"model": AdaBoostRegressor(),
"params": {
"model__n_estimators": [10, 50, 100, 300, 500],
"model__learning_rate": np.logspace(-2, 0, 3),
"model__loss": ["linear", "square", "exponential"],
},
},
}
Now, we have everything we need to build our pipelines.
For each of the models we have selected, we will do the following:
1. Build a Pipeline with preprocessor, feature_selector, and model steps. These steps are chained together to form a single model. We will use the memory parameter to enable caching and set the verbose parameter to True so we can view progress while the Pipeline is running.
2. Create a param_grid by combining the parameters for the preprocessor, feature_selector, and model steps.
3. Check if we have already fitted the model and saved it for future use using skops.
- If the model has already been saved, just load it from the .skops file in the models directory. There is no need to retrain the model.
- If the model has not already been saved, we will do the following:
  - Set up an exhaustive grid search (GridSearchCV). We pass in our pipeline and param_grid. The grid search will exhaustively generate candidates from the param_grid by fitting the pipeline using every combination of the hyperparameters specified. We also pass in a cross-validation scheme (cv). In this case, we are using $5$-fold cross-validation. This involves randomly splitting the training data into $5$ folds, using $4$ for training and $1$ for validation. We repeat this $5$ times and then take the average of the performance metrics. This can help us prevent overfitting, so that our model generalizes well to new data and doesn't only perform well on the training data.
  - Fit the grid search on the training data (X_train and y_train). The grid search gives us the model with the set of hyperparameters that has the best performance metrics.
  - Save the fitted model as a .skops file in the models directory.
# go through each model
for (name, m) in models.items():
    # print model name
    print(f"====== {name} ======")
    # build pipeline
    m["pipeline"] = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("feature_selector", feature_selector(m["model"])),
            ("model", m["model"]),
        ],
        memory="cache",
        verbose=True,
    )
    # build param grid
    m["param_grid"] = {**preprocessor_params, **feature_selector_params, **m["params"]}
    # get path for loading/saving model
    modified_name = name.replace(" ", "")
    path = f"models/{modified_name}.skops"
    # if fitted model has already been saved, load it
    if Path(path).exists():
        with open(path, "rb") as file:
            m["grid_search"] = sio.load(file, trusted=True)
        print(f'Loaded fitted model from "{path}".')
    # otherwise, proceed with fitting model
    else:
        # perform exhaustive grid search with cross-validation
        m["grid_search"] = GridSearchCV(m["pipeline"], m["param_grid"], cv=5, n_jobs=-1)
        m["grid_search"].fit(X_train, y_train)
        # save fitted model
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as file:
            sio.dump(m["grid_search"], file)
        print(f'Saved fitted model to "{path}".')
    # print new line
    print("\n")
====== Linear Regression ====== Loaded fitted model from "models/LinearRegression.skops". ====== Ridge Regression ====== Loaded fitted model from "models/RidgeRegression.skops". ====== Lasso Regression ====== Loaded fitted model from "models/LassoRegression.skops". ====== Elastic Net Regression ====== Loaded fitted model from "models/ElasticNetRegression.skops". ====== Decision Tree Regression ====== Loaded fitted model from "models/DecisionTreeRegression.skops". ====== Random Forest Regression ====== Loaded fitted model from "models/RandomForestRegression.skops". ====== Gradient Boosting Regression ====== Loaded fitted model from "models/GradientBoostingRegression.skops". ====== AdaBoost Regression ====== Loaded fitted model from "models/AdaBoostRegression.skops".
Now that we have all of our fitted models, let's evaluate them.
Let's start off by comparing models using performance metrics. We will use the following metrics:
- coefficient of determination ($R^2$)
- root mean squared error ($\text{RMSE}$)
- mean absolute error ($\text{MAE}$)
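For reference, with $y_i$ the true values, $\hat{y}_i$ the predictions, $\bar{y}$ the mean of the true values, and $n$ the number of samples:
$$ R^2 = 1 - \frac{\sum_{i} (y_i - \hat{y}_i)^2}{\sum_{i} (y_i - \bar{y})^2} \qquad \text{RMSE} = \sqrt{\frac{1}{n} \sum_{i} (y_i - \hat{y}_i)^2} \qquad \text{MAE} = \frac{1}{n} \sum_{i} \lvert y_i - \hat{y}_i \rvert $$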
We'll calculate these performance metrics by predicting the target variable using the test data and comparing the real values of the target variable to the predicted values.
In this case, our target variable is playoff wins (PlayoffW).
We will tabulate the results in a DataFrame.
model_data = pd.DataFrame(
{
"Model": [],
"R2": [],
"RMSE": [],
"MAE": [],
}
)
model_labels = {
"Model": "Model",
"R2": "Coefficient of Determination (R2)",
"RMSE": "Root Mean Squared Error (RMSE)",
"MAE": "Mean Absolute Error (MAE)",
}
for (name, m) in models.items():
    # predict playoff wins on the held-out test set using the best pipeline found by the grid search
    y_pred = m["grid_search"].predict(X_test)
    rows.append(
        {
            "Model": name,
            "R2": r2_score(y_test, y_pred),
            "RMSE": mean_squared_error(y_test, y_pred, squared=False),
            "MAE": mean_absolute_error(y_test, y_pred),
        }
    )
model_data = pd.DataFrame(rows)
model_data
| | Model | R2 | RMSE | MAE |
|---|---|---|---|---|
| 0 | Linear Regression | 0.268811 | 3.997359 | 3.266602 |
| 1 | Ridge Regression | 0.445865 | 3.479894 | 2.684129 |
| 2 | Lasso Regression | 0.415087 | 3.575229 | 2.783280 |
| 3 | Elastic Net Regression | 0.408355 | 3.595744 | 2.808841 |
| 4 | Decision Tree Regression | 0.526508 | 3.216729 | 2.381250 |
| 5 | Random Forest Regression | 0.436751 | 3.508393 | 2.628015 |
| 6 | Gradient Boosting Regression | 0.478939 | 3.374446 | 2.606227 |
| 7 | AdaBoost Regression | 0.484823 | 3.355339 | 2.521633 |
Now, let's make a bar plot of model performance. We will have the model on the $x$-axis and the score on the $y$-axis. There will be $3$ bars for each model: $1$ for each performance metric ($R^2$, $\text{RMSE}$, and $\text{MAE}$).
fig = px.bar(
model_data,
x="Model",
y=["R2", "RMSE", "MAE"],
title="Model Performance",
labels=model_labels,
barmode="group",
height=600,
)
fig.update_layout(yaxis_title="Score", legend_title_text="Metric")
fig.for_each_trace(
lambda t: t.update(
name=model_labels[t.name],
legendgroup=model_labels[t.name],
hovertemplate="<br>".join(
[
"Model=%{x}",
"Score=%{y}",
]
),
)
)
fig.show()
While several of these models perform reasonably well, one stands out as the best: the decision tree regression model (though gradient boosting regression and AdaBoost regression are close behind). It has the best performance in all $3$ metrics:
$R^2 = 0.5265$ (highest)
This is a measure of precision. Approximately $52.65\%$ of the variance in playoff wins is explained by the independent variables in the decision tree regression model.
$\text{RMSE} = 3.22$ (lowest)
This is a measure of accuracy that heavily penalizes large errors, making it more sensitive to outliers. By this measure, on average, the decision tree regression model is off by $3.22$ wins when predicting playoff wins.
$\text{MAE} = 2.38$ (lowest)
This is a measure of accuracy that doesn't penalize large errors as much, making it more robust to outliers. By this measure, on average, the decision tree regression model is off by $2.38$ wins when predicting playoff wins.
Based on these scores, the decision tree regression model has moderately high precision (as indicated by the $R^2$ value) and moderately high accuracy (as indicated by the $\text{RMSE}$ and $\text{MAE}$ values).
While there is certainly room for improvement (especially in the $R^2$ value), these are some pretty good results for something notoriously hard to predict.
Now that we know the decision tree regression model performs the best, let's take a look at how it works.
Let's start off by taking a look at what features the model uses.
pipeline = models["Decision Tree Regression"]["grid_search"].best_estimator_
feature_names = pipeline["preprocessor"].get_feature_names_out()
selected_features_mask = pipeline["feature_selector"].get_support()
selected_features = feature_names[selected_features_mask]
selected_features
array(['numerical__Seed'], dtype=object)
It turns out that the best decision tree regression model only uses Seed as a feature. All of the other features were removed during the feature selection / dimensionality reduction process.
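If we want to see not just which features survived but the order in which the rest were eliminated, we can inspect the selector's ranking_ attribute. A quick sketch, assuming the feature_selector step is an RFECV instance (consistent with our imports); features ranked $1$ are the ones kept:

# rank of each preprocessed feature (1 = selected; larger = eliminated earlier)
ranking = pd.Series(
    pipeline["feature_selector"].ranking_,
    index=feature_names,
).sort_values()
ranking.head(10)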
Was this the case for all of the models we trained?
for (name, m) in models.items():
    m_pipeline = m["grid_search"].best_estimator_
    m_feature_names = m_pipeline["preprocessor"].get_feature_names_out()
    m_selected_features_mask = m_pipeline["feature_selector"].get_support()
    m_selected_features = m_feature_names[m_selected_features_mask]
    print(f"====== {name} ======")
    print(m_selected_features)
    print("\n")
====== Linear Regression ======
['categorical__Season_2003' 'categorical__Season_2004' 'categorical__Season_2005' 'categorical__Season_2006' 'categorical__Season_2007' 'categorical__Season_2008' 'categorical__Season_2009' 'categorical__Season_2010' 'categorical__Season_2011' 'categorical__Season_2012' 'categorical__Season_2013' 'categorical__Season_2014' 'categorical__Season_2015' 'categorical__Season_2016' 'categorical__Season_2017' 'categorical__Season_2018' 'categorical__Season_2019' 'categorical__Season_2020' 'categorical__Season_2021' 'categorical__Season_2022' 'categorical__Team_Cleveland Cavaliers' 'categorical__Team_Detroit Pistons' 'categorical__Team_Golden State Warriors' 'categorical__Team_Los Angeles Lakers' 'categorical__Team_Miami Heat' 'categorical__Team_New Jersey Nets' 'categorical__Team_Sacramento Kings' 'numerical__FG' 'numerical__FGA' 'numerical__FG%' 'numerical__3P' 'numerical__3PA' 'numerical__2P' 'numerical__2PA' 'numerical__FT' 'numerical__FTA' 'numerical__ORB' 'numerical__DRB' 'numerical__TRB' 'numerical__PTS' 'numerical__ORtg' 'numerical__DRtg' 'numerical__NRtg' 'numerical__eFG%' 'numerical__W%' 'numerical__HomeW%' 'numerical__RoadW%']

====== Ridge Regression ======
['categorical__Team_Cleveland Cavaliers' 'categorical__Team_Golden State Warriors' 'categorical__Team_Los Angeles Lakers' 'categorical__Team_Miami Heat' 'numerical__Seed' 'numerical__FT%' 'numerical__NRtg' 'numerical__TOV%']

====== Lasso Regression ======
['numerical__Seed' 'numerical__NRtg' 'numerical__OppFT/FGA' 'numerical__Rank']

====== Elastic Net Regression ======
['numerical__Seed' 'numerical__SRS' 'numerical__NRtg' 'numerical__OppFT/FGA' 'numerical__Rank']

====== Decision Tree Regression ======
['numerical__Seed']

====== Random Forest Regression ======
['numerical__Seed' 'numerical__DRtg' 'numerical__Rank']

====== Gradient Boosting Regression ======
['categorical__Season_2021' 'categorical__Team_Cleveland Cavaliers' 'categorical__Team_Golden State Warriors' 'categorical__Team_Los Angeles Lakers' 'categorical__Team_Miami Heat' 'categorical__Team_San Antonio Spurs' 'numerical__Seed' 'numerical__FG' 'numerical__FGA' 'numerical__FG%' 'numerical__3PA' 'numerical__3P%' 'numerical__2P' 'numerical__2PA' 'numerical__FT' 'numerical__FTA' 'numerical__FT%' 'numerical__ORB' 'numerical__TRB' 'numerical__AST' 'numerical__BLK' 'numerical__TOV' 'numerical__PF' 'numerical__PTS' 'numerical__SRS' 'numerical__ORtg' 'numerical__DRtg' 'numerical__NRtg' 'numerical__3PAr' 'numerical__TS%' 'numerical__eFG%' 'numerical__TOV%' 'numerical__ORB%' 'numerical__FT/FGA' 'numerical__OppeFG%' 'numerical__OppTOV%' 'numerical__OppFT/FGA' 'numerical__Rank' 'numerical__W%' 'numerical__HomeW%' 'numerical__RoadW%' 'numerical__EastW%' 'numerical__WestW%' 'numerical__ConferenceW%']

====== AdaBoost Regression ======
['numerical__Seed' 'numerical__DRtg' 'numerical__eFG%' 'numerical__Rank' 'numerical__HomeW%' 'numerical__ConferenceW%']
As we can see, only the decision tree regression model uses a single feature. The other models use more features, yet they perform worse on the test data.
This is not necessarily a bad thing. Perhaps Seed truly is the best predictor of playoff wins, and the other features do not contribute significantly to the predictive power of the model.
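One quick sanity check on that idea is to measure how strongly Seed relates to playoff wins on its own. A minimal sketch, assuming the raw X_train still contains the Seed column (before preprocessing):

# rank correlation between regular season seed and playoff wins on the training set
print(X_train["Seed"].corr(y_train, method="spearman"))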
The simplicity of the decision tree regression model is quite nice, and we can actually visualize how it works.
plot_tree(
pipeline["model"],
feature_names=selected_features,
class_names=[y_name],
filled=True,
rounded=True,
)
[Decision tree plot: the root splits on numerical__Seed <= -0.429 ($256$ training samples); further splits at -1.308, -0.869, 0.01, 0.45, and 0.889 yield $7$ leaves whose predicted playoff wins range from about $1.3$ for the weakest seeds to $11.0$ for the strongest.]
We can observe a simple tree-like structure, in which a team is assigned a certain number of playoff wins based on its seed.
This simple model manages to outperform the other models, which rely on many more features than just Seed.
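For a more readable view of the same structure, scikit-learn can also print a tree's decision rules as text. A quick sketch, reusing the pipeline and selected_features from above:

from sklearn.tree import export_text

# print the fitted tree's decision rules as indented text
print(export_text(pipeline["model"], feature_names=list(selected_features)))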
Now, it's time to put this decision tree regression model to the test by using it to predict the outcome of the $2023$ NBA Playoffs.
Let's start off by generating the predictions and sorting teams by predicted playoff wins (PredPlayoffW).
decision_tree = models["Decision Tree Regression"]["grid_search"]
X_current = current_data[X_names]
current_pred = X_current.copy()
current_pred["PredPlayoffW"] = decision_tree.predict(X_current)
current_pred = current_pred[["Team", "Conference", "Seed", "PredPlayoffW"]]
current_pred = current_pred.sort_values(by="PredPlayoffW", ascending=False)
current_pred = current_pred.reset_index(drop=True)
current_pred
| | Team | Conference | Seed | PredPlayoffW |
|---|---|---|---|---|
| 0 | Boston Celtics | East | 2 | 11.000000 |
| 1 | Memphis Grizzlies | West | 2 | 11.000000 |
| 2 | Milwaukee Bucks | East | 1 | 10.161290 |
| 3 | Denver Nuggets | West | 1 | 10.161290 |
| 4 | Sacramento Kings | West | 3 | 6.838710 |
| 5 | Philadelphia 76ers | East | 3 | 6.838710 |
| 6 | Phoenix Suns | West | 4 | 5.066667 |
| 7 | Cleveland Cavaliers | East | 4 | 5.066667 |
| 8 | Golden State Warriors | West | 5 | 4.000000 |
| 9 | New York Knicks | East | 5 | 4.000000 |
| 10 | Los Angeles Clippers | West | 6 | 2.566667 |
| 11 | Brooklyn Nets | East | 6 | 2.566667 |
| 12 | Atlanta Hawks | East | 8 | 1.296875 |
| 13 | Los Angeles Lakers | West | 7 | 1.296875 |
| 14 | Minnesota Timberwolves | West | 8 | 1.296875 |
| 15 | Miami Heat | East | 7 | 1.296875 |
Now, let's rank these teams within their conferences by the predicted number of playoff wins and assign a status (PlayoffStatus) indicating how far they are expected to go in the playoffs based on their rank relative to other teams.
current_pred["ConferencePlayoffRank"] = (
current_pred.groupby("Conference")["PredPlayoffW"]
.rank(ascending=False, method="first")
.astype("int64")
)
def playoff_status(row):
if row["ConferencePlayoffRank"] <= 1:
return "Finals"
elif row["ConferencePlayoffRank"] <= 2:
return "Conference Finals"
elif row["ConferencePlayoffRank"] <= 4:
return "Conference Semifinals"
elif row["ConferencePlayoffRank"] <= 8:
return "First Round"
current_pred["PlayoffStatus"] = current_pred.apply(playoff_status, axis=1)
current_pred
| | Team | Conference | Seed | PredPlayoffW | ConferencePlayoffRank | PlayoffStatus |
|---|---|---|---|---|---|---|
| 0 | Boston Celtics | East | 2 | 11.000000 | 1 | Finals |
| 1 | Memphis Grizzlies | West | 2 | 11.000000 | 1 | Finals |
| 2 | Milwaukee Bucks | East | 1 | 10.161290 | 2 | Conference Finals |
| 3 | Denver Nuggets | West | 1 | 10.161290 | 2 | Conference Finals |
| 4 | Sacramento Kings | West | 3 | 6.838710 | 3 | Conference Semifinals |
| 5 | Philadelphia 76ers | East | 3 | 6.838710 | 3 | Conference Semifinals |
| 6 | Phoenix Suns | West | 4 | 5.066667 | 4 | Conference Semifinals |
| 7 | Cleveland Cavaliers | East | 4 | 5.066667 | 4 | Conference Semifinals |
| 8 | Golden State Warriors | West | 5 | 4.000000 | 5 | First Round |
| 9 | New York Knicks | East | 5 | 4.000000 | 5 | First Round |
| 10 | Los Angeles Clippers | West | 6 | 2.566667 | 6 | First Round |
| 11 | Brooklyn Nets | East | 6 | 2.566667 | 6 | First Round |
| 12 | Atlanta Hawks | East | 8 | 1.296875 | 7 | First Round |
| 13 | Los Angeles Lakers | West | 7 | 1.296875 | 7 | First Round |
| 14 | Minnesota Timberwolves | West | 8 | 1.296875 | 8 | First Round |
| 15 | Miami Heat | East | 7 | 1.296875 | 8 | First Round |
Based on this decision tree regression model, we would predict that the Boston Celtics and the Memphis Grizzlies are the teams most likely to make it to the finals, with the model predicting that each will win $11$ playoff games.
The $2023$ NBA Playoffs are still ongoing, but we can see that this model is not fully accurate. For example, the Memphis Grizzlies were eliminated in the first round, winning just $2$ games, despite the model's prediction that they would make it to the finals.
Nonetheless, it will be interesting to see how our predictions compare to the actual results once the playoffs are over.
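Once the playoffs conclude, one way to quantify how far off these predictions were is to score them with the same metrics we used on the test set. A minimal sketch, where ActualPlayoffW is a hypothetical column we would fill in with the real results:

# hypothetical: record each team's actual playoff wins once the playoffs end
# current_pred["ActualPlayoffW"] = [...]
mae = mean_absolute_error(current_pred["ActualPlayoffW"], current_pred["PredPlayoffW"])
rmse = mean_squared_error(current_pred["ActualPlayoffW"], current_pred["PredPlayoffW"], squared=False)
print(f"MAE: {mae:.2f} wins, RMSE: {rmse:.2f} wins")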
Now that we have gone through all the other stages of the data science pipeline, it is time to interpret our results and provide insights.
Our goal in this project was to see if we could identify factors underlying teams' level of success in the playoffs. With this information, we wanted to see if we could accurately predict the number of playoff games a team would win given regular season data.
To accomplish this, we went through the following steps: we collected and processed regular season and playoff data from Basketball Reference, trained $8$ regression models using pipelines with exhaustive grid search and cross-validation, evaluated them on held-out test data, and used the best model to predict the $2023$ NBA Playoffs.
In the end, our best model was a decision tree regression model using Seed as its only feature. The model accounted for approximately $53\%$ of the variance in playoff wins (as measured by $R^2$) and was accurate to within $2$ to $3$ games (as measured by $\text{MAE}$ and $\text{RMSE}$, respectively).
While it was surprising to see a model with just $1$ feature perform best, the result is telling: it indicates how crucial seeding is in determining playoff success. The teams that do the best in the regular season tend to do the best in the playoffs, and the other features don't provide much additional information given how much variance there is in the data.
This result also highlights the difficulty of predicting playoff wins using only regular season data. There is so much variability that season-long statistics don't reflect: injuries, trades, and swings in player performance can all change the course of a playoff run.
Making predictions about the NBA is extremely difficult, and even the experts get it wrong. That being said, it can still be interesting to see different ways of going about predicting the NBA playoffs.
If you'd like to see another, more rigorous approach, you can check out FiveThirtyEight's live-updating $2022\text{-}23$ NBA Predictions. They make their forecasts by simulating the season, taking into account individual player performance as well as factors like injuries, trades, and changes in performance throughout the season. Their projections can be surprisingly accurate, but even they get things wrong from time to time.
To conclude, hopefully you enjoyed learning about how we can predict the outcome of the NBA playoffs with reasonable precision and accuracy by going through the stages of the data science pipeline! We can make predictions about other things by going through the same steps, though the specific techniques might vary depending on the problem we are trying to solve.