Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions pybaseball/playoff_odds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from io import StringIO
from typing import List, Optional

import pandas as pd
from bs4 import BeautifulSoup, Comment, PageElement, ResultSet

from . import cache
from .utils import most_recent_season
from .datasources.bref import BRefSession

# Module-level Baseball-Reference session object (BRefSession comes from .datasources.bref).
session = BRefSession()

def playoff_odds(season: Optional[int] = None) -> Optional[pd.DataFrame]:
    """Scrape the Baseball-Reference playoff-odds table for one season.

    Args:
        season: Year to look up. When ``None``, the value returned by
            ``most_recent_season()`` is used. Seasons before 2020 are
            rejected without making a request.

    Returns:
        A DataFrame with the columns ``Tm``, ``Lg``, ``WC``, ``Div``, ``LDS``,
        ``LCS``, ``Pennant`` and ``Win WS``, with rows containing missing
        values and the repeated division/header rows removed. ``None`` is
        returned (after printing a message) when the season is earlier than
        2020 or when the page has no table with id ``playoff_prob_mlb``.
    """
    if season is None:
        season = most_recent_season()

    if season < 2020:
        print(f"Playoff odds not found for {season}")
        return None

    # Reuse the module-level session rather than building a new one per call.
    url = f'https://www.baseball-reference.com/leagues/majors/{season}-playoff-odds.shtml'
    page = session.get(url).content
    soup = BeautifulSoup(page, "lxml")

    # Find the specific table by id
    table = soup.find('table', {'id': 'playoff_prob_mlb'})
    if table is None:
        print(f"Table with id 'playoff_prob_mlb' not found for season {season}")
        return None

    # HTML -> DataFrame; the table has a two-row header, so columns are tuples.
    df = pd.read_html(StringIO(str(table)))[0]

    # The first element of each tuple is the placeholder name found in the
    # parsed frame for a blank top-level header cell.
    # NOTE(review): these positions depend on the current page layout - confirm
    # if the site changes its table.
    cols = [
        ('Unnamed: 1_level_0', 'Tm'),
        ('Unnamed: 2_level_0', 'Lg'),
        ('Unnamed: 20_level_0', 'WC'),
        ('Unnamed: 21_level_0', 'Div'),
        ('Unnamed: 22_level_0', 'LDS'),
        ('Unnamed: 23_level_0', 'LCS'),
        ('Unnamed: 24_level_0', 'Pennant'),
        ('Unnamed: 25_level_0', 'Win WS'),
    ]

    df_filtered = df[cols].dropna()

    # Remove division header rows (and repeated 'Tm' header rows) that the
    # page interleaves with the team rows.
    division_names = [
        'NL East', 'NL Central', 'NL West',
        'AL East', 'AL Central', 'AL West', 'Tm',
    ]

    mask = ~df_filtered.iloc[:, 0].isin(division_names)
    new_df = df_filtered[mask].reset_index(drop=True)

    # Flatten the two-level header into plain column names.
    new_df.columns = ['Tm', 'Lg', 'WC', 'Div', 'LDS', 'LCS', 'Pennant', 'Win WS']

    return new_df


26 changes: 26 additions & 0 deletions tests/integration/pybaseball/test_playoff_odds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from time import sleep
from typing import Generator, Optional

import pandas as pd
import pytest

from pybaseball.playoff_odds import playoff_odds
from pybaseball.utils import most_recent_season

@pytest.fixture(autouse=True)
def before_after_each() -> Generator[None, None, None]:
    """Pause after every test so consecutive tests do not hit BBRef back to back."""
    # BBRef will throttle us if we make more than 10 calls per minute
    pause_seconds = 6

    # Nothing to set up; hand control to the test straight away.
    yield

    # Teardown: wait before the next test runs.
    sleep(pause_seconds)

class TestBRefPlayoffOdds:
    """Integration tests for ``playoff_odds``."""

    # Column names that playoff_odds assigns to the frame it returns.
    EXPECTED_COLUMNS = ['Tm', 'Lg', 'WC', 'Div', 'LDS', 'LCS', 'Pennant', 'Win WS']

    @pytest.mark.parametrize(
        "season", [2024]
    )
    def test_odds(self, season: Optional[int]) -> None:
        """A supported season yields a non-empty frame with the expected columns."""
        season_playoff_odds = playoff_odds(season)

        assert season_playoff_odds is not None
        assert isinstance(season_playoff_odds, pd.DataFrame)
        assert list(season_playoff_odds.columns) == self.EXPECTED_COLUMNS
        assert len(season_playoff_odds.index) > 0

    def test_odds_unsupported_season(self) -> None:
        """Seasons before 2020 are rejected and return None."""
        assert playoff_odds(2019) is None