jldbc · earlonrails · Jun 2, 2024 · Jun 6, 2024
diff --git a/docs/boxes.md b/docs/boxes.md
@@ -0,0 +1,23 @@
+# boxes
+
+`boxes(team: str, date: str, double_header: int = 0)`
+
+Get Baseball Reference's box score data for a particular game.
+
+## Arguments
+`team` String. The team abbreviation used in the Baseball Reference box score URL (e.g. `DET`).
+`date` String. The date of the game whose box score is wanted, in `YYYY-MM-DD` format (e.g. `2010-07-19`). Must be in 2008 or later.
+`double_header` Integer. 0 for the first game of the day, and 1 for the second game. Default is 0.
+
+## Examples of valid queries
+
+```python
+from pybaseball import boxes
+
+team = 'DET'
+date = '2010-07-19'
+
+boxes(team, date)
+
+```
+
diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py
@@ -5,6 +5,7 @@
 from .playerid_lookup import chadwick_register
 from .teamid_lookup import fangraphs_teams
 from .teamid_lookup import team_ids
+from .boxes import boxes
 from .statcast import statcast, statcast_single_game
 from .statcast_pitcher import (
 	statcast_pitcher,

diff --git a/pybaseball/boxes.py b/pybaseball/boxes.py
@@ -0,0 +1,71 @@
+from datetime import date
+
+import pandas as pd
+from bs4 import BeautifulSoup
+
+from .utils import sanitize_date_range
+from .datasources.bref import BRefSession
+
+session = BRefSession()
+
+
+def get_soup(team: str, start_dt: date, double_header: int = 0) -> BeautifulSoup:
+    # Box score pages live at /boxes/<team>/<team><YYYYMMDD><double_header>.shtml,
+    # i.e. the team code appears twice, followed by the game date and the
+    # double_header digit. Example for team='DET', start_dt=2010-07-19,
+    # double_header=0:
+    # https://www.baseball-reference.com/boxes/DET/DET201007190.shtml
+    url = "http://www.baseball-reference.com/boxes/{}/{}{}{}.shtml".format(team, team, start_dt.strftime('%Y%m%d'), double_header)
+    s = session.get(url).content
+    # a workaround to avoid beautiful soup applying the wrong encoding
+    s = s.decode('utf-8')
+    return BeautifulSoup(s, features="lxml")
+
+def extract_line_score(data):
+    return {
+        'team': data[1],
+        'line_score': ''.join(data[2:-4]),
+        'runs': data[-4],
+        'hits': data[-3],
+        'errors': data[-2],
+    }
+
+
+def get_table(soup: BeautifulSoup) -> pd.DataFrame:
+    table = soup.find_all('table')[0]
+    data = []
+    headings = [th.get_text() for th in table.find("tr").find_all("th")][1:]
+    headings.append("mlbID")
+    data.append(headings)
+    table_body = table.find('tbody')
+    rows = table_body.find_all('tr')
+    for row in rows:
+        cols = row.find_all('td')
+        row_anchor = row.find("a")
+        mlbid = row_anchor["href"].split("mlb_ID=")[-1] if row_anchor else pd.NA  # ID str or nan
+        cols = [ele.text.strip() for ele in cols]
+        cols.append(mlbid)
+        data.append([ele for ele in cols])
+    data = [extract_line_score(d) for d in data]
+    df = pd.DataFrame(data)
+    df = df.reindex(df.index.drop(0))
+    return df
+
+
+def boxes(team: str, date: str, double_header: int = 0) -> pd.DataFrame:
+    """
+    Get the line score (team, line score, runs, hits, errors) from the
+    Baseball Reference box score page of a single game. Supply the team
+    abbreviation, the game date in YYYY-MM-DD format, and the double_header flag.
+    """
+    # make sure date inputs are valid
+    game_date, end_dt_date = sanitize_date_range(date, date)
+    if game_date.year < 2008:
+        raise ValueError("Year must be 2008 or later")
+    if end_dt_date.year < 2008:
+        raise ValueError("Year must be 2008 or later")
+    # retrieve html from baseball reference
+    soup = get_soup(team, game_date, double_header)
+    table = get_table(soup)
+    table = table.dropna(how='all')  # drop if all columns are NA
+    return table
diff --git a/pybaseball/retrosheet.py b/pybaseball/retrosheet.py
@@ -30,6 +30,12 @@
 from github.GithubException import RateLimitExceededException
 import warnings
 
+EVENT_TYPES = {
+    'regular': ('.EVA','.EVN'),
+    'post': ('CS.EVE','D1.EVE','D2.EVE','W1.EVE','W2.EVE','WS.EVE'),
+    'asg': ('AS.EVE')
+}
+
 gamelog_columns = [
     'date', 'game_num', 'day_of_week', 'visiting_team',
     'visiting_team_league', 'visiting_team_game_num', 'home_team',
@@ -107,9 +113,9 @@
 roster_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/seasons/{}/{}{}.ROS'
 event_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/seasons/{}/{}'
 
-def events(season, type='regular', export_dir='.'):
+def events(season, kind='regular', export_dir='.'):
     """
-    Pulls retrosheet event files for an entire season. The `type` argument
+    Pulls retrosheet event files for an entire season. The `kind` argument
     specifies whether to pull regular season, postseason or asg files. Valid
     arguments are 'regular', 'post', and 'asg'.
 
@@ -119,14 +125,8 @@ def events(season, type='regular', export_dir='.'):
     GH_TOKEN=os.getenv('GH_TOKEN', '')
     if not os.path.exists(export_dir):
         os.mkdir(export_dir)
-
-    match type:
-        case 'regular':
-            file_extension = ('.EVA','.EVN')
-        case 'post':
-            file_extension = ('CS.EVE','D1.EVE','D2.EVE','W1.EVE','W2.EVE','WS.EVE')
-        case 'asg':
-            file_extension = ('AS.EVE')
+
+    file_extension = EVENT_TYPES.get(kind)
 
     try:
         g = Github(GH_TOKEN)
@@ -215,7 +215,7 @@ def schedules(season):
     repo = g.get_repo('chadwickbureau/retrosheet')
     season_folder = [f.path[f.path.rfind('/')+1:] for f in repo.get_contents(f'seasons/{season}')]
     file_name = f'{season}schedule.csv'
-    
+
     if file_name not in season_folder:
         raise ValueError(f'Schedule not available for {season}')
     s = get_text_file(schedule_url.format(season, season))