Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/boxes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# boxes

`boxes(team: str, date: str, double_header: int = 0)`

Get Baseball Reference's box score data for a particular game.

## Arguments
`team` String. The team's abbreviation as used by Baseball Reference (e.g. `'DET'`).
`date` String. The date of the game whose box score is wanted, in `YYYY-MM-DD` format (2008 or later).
`double_header` Integer. 0 for the first game of the day, and 1 for the second game. Default is 0.

## Examples of valid queries

```python
from pybaseball import boxes

team = 'DET'
date = '2010-07-19'

boxes(team, date)

```

1 change: 1 addition & 0 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .playerid_lookup import chadwick_register
from .teamid_lookup import fangraphs_teams
from .teamid_lookup import team_ids
from .boxes import boxes
from .statcast import statcast, statcast_single_game
from .statcast_pitcher import (
statcast_pitcher,
Expand Down
71 changes: 71 additions & 0 deletions pybaseball/boxes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from datetime import date

import pandas as pd
from bs4 import BeautifulSoup

from .utils import sanitize_date_range
from .datasources.bref import BRefSession

# Module-level Baseball Reference session; get_soup() issues its page requests through it.
session = BRefSession()


def get_soup(team: str, start_dt: date, double_header: int = 0) -> BeautifulSoup:
    """
    Download and parse the Baseball Reference box-score page for one game.

    The requested address has the form
    ``/boxes/<team>/<team><YYYYMMDD><double_header>.shtml``, for example
    https://www.baseball-reference.com/boxes/DET/DET201007190.shtml

    ARGUMENTS
      team          : str  : team abbreviation used twice in the URL path
      start_dt      : date : game date, rendered as YYYYMMDD
      double_header : int  : digit appended directly after the date (default 0)

    Returns the page parsed with the lxml parser.
    """
    game_day = start_dt.strftime('%Y%m%d')
    url = "http://www.baseball-reference.com/boxes/{}/{}{}{}.shtml".format(
        team, team, game_day, double_header
    )
    raw_page = session.get(url).content
    # Decode explicitly as UTF-8 so BeautifulSoup does not apply the wrong encoding.
    page_text = raw_page.decode('utf-8')
    return BeautifulSoup(page_text, features="lxml")

def extract_line_score(data):
    """
    Turn one flat row of line-score cells into a labelled dict.

    Expected layout of ``data`` (by position):
      data[1]     -> team
      data[2:-4]  -> per-inning cells, concatenated into one string
      data[-4]    -> runs
      data[-3]    -> hits
      data[-2]    -> errors
    The first and last elements are not used.
    """
    inning_cells = data[2:-4]
    summary = {'team': data[1], 'line_score': ''.join(inning_cells)}
    # Totals sit at fixed offsets from the end of the row.
    summary['runs'] = data[-4]
    summary['hits'] = data[-3]
    summary['errors'] = data[-2]
    return summary


def get_table(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Build a DataFrame from the first <table> element of a parsed page.

    The header row and every <tbody> row are flattened to lists of cell text,
    each list gets one trailing ID element, and each list is then converted
    with extract_line_score(). The record produced from the header row is
    dropped before returning, so the result keeps index labels starting at 1.
    """
    first_table = soup.find_all('table')[0]

    # Header texts, minus the first <th>, plus a label for the trailing ID element.
    header_cells = first_table.find("tr").find_all("th")
    header = [cell.get_text() for cell in header_cells][1:]
    header.append("mlbID")
    raw_rows = [header]

    for tr in first_table.find('tbody').find_all('tr'):
        cells = tr.find_all('td')
        link = tr.find("a")
        if link:
            # Whatever follows "mlb_ID=" in the row's first link (the whole href if absent).
            mlb_id = link["href"].split("mlb_ID=")[-1]
        else:
            mlb_id = pd.NA
        raw_rows.append([cell.text.strip() for cell in cells] + [mlb_id])

    frame = pd.DataFrame([extract_line_score(raw) for raw in raw_rows])
    # Row 0 came from the header list, not from a game row.
    return frame.reindex(frame.index.drop(0))


def boxes(team: str, date: str, double_header: int = 0) -> pd.DataFrame:
    """
    Get the Baseball Reference box-score line score for one team's game.

    ARGUMENTS
      team          : str : team abbreviation used in the box-score URL, e.g. 'DET'
      date          : str : date of the game, e.g. '2010-07-19'; the year must be
                            2008 or later
      double_header : int : digit appended to the box-score URL after the date
                            (default 0)

    Returns a DataFrame with the columns team, line_score, runs, hits and
    errors; rows in which every column is NA are removed.

    Raises ValueError when the parsed date falls before 2008.
    """
    # sanitize_date_range works on a start/end pair, so the single game date
    # is supplied for both ends.
    game_date, end_dt_date = sanitize_date_range(date, date)
    if game_date.year < 2008 or end_dt_date.year < 2008:
        raise ValueError("Year must be 2008 or later")

    # Fetch the page from Baseball Reference and parse its table.
    page = get_soup(team, game_date, double_header)
    line_scores = get_table(page)
    # Discard rows where all columns are NA.
    return line_scores.dropna(how='all')
22 changes: 11 additions & 11 deletions pybaseball/retrosheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@
from github.GithubException import RateLimitExceededException
import warnings

# File-name suffixes of the retrosheet event files for each kind of game.
# Every value is a tuple of suffixes. Note that ('AS.EVE') without a trailing
# comma is just the string 'AS.EVE', so the one-element tuple is spelled out
# to keep all three values the same type.
EVENT_TYPES = {
    'regular': ('.EVA', '.EVN'),
    'post': ('CS.EVE', 'D1.EVE', 'D2.EVE', 'W1.EVE', 'W2.EVE', 'WS.EVE'),
    'asg': ('AS.EVE',),
}

gamelog_columns = [
'date', 'game_num', 'day_of_week', 'visiting_team',
'visiting_team_league', 'visiting_team_game_num', 'home_team',
Expand Down Expand Up @@ -107,9 +113,9 @@
roster_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/seasons/{}/{}{}.ROS'
event_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/seasons/{}/{}'

def events(season, type='regular', export_dir='.'):
def events(season, kind='regular', export_dir='.'):
"""
Pulls retrosheet event files for an entire season. The `type` argument
Pulls retrosheet event files for an entire season. The `kind` argument
specifies whether to pull regular season, postseason or asg files. Valid
arguments are 'regular', 'post', and 'asg'.

Expand All @@ -119,14 +125,8 @@ def events(season, type='regular', export_dir='.'):
GH_TOKEN=os.getenv('GH_TOKEN', '')
if not os.path.exists(export_dir):
os.mkdir(export_dir)

match type:
case 'regular':
file_extension = ('.EVA','.EVN')
case 'post':
file_extension = ('CS.EVE','D1.EVE','D2.EVE','W1.EVE','W2.EVE','WS.EVE')
case 'asg':
file_extension = ('AS.EVE')

file_extension = EVENT_TYPES.get(kind)

try:
g = Github(GH_TOKEN)
Expand Down Expand Up @@ -215,7 +215,7 @@ def schedules(season):
repo = g.get_repo('chadwickbureau/retrosheet')
season_folder = [f.path[f.path.rfind('/')+1:] for f in repo.get_contents(f'seasons/{season}')]
file_name = f'{season}schedule.csv'

if file_name not in season_folder:
raise ValueError(f'Schedule not available for {season}')
s = get_text_file(schedule_url.format(season, season))
Expand Down