Skip to content
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
340fcec
add python-dotenv
K-Meech Mar 25, 2026
dd6a1dd
add function to fetch data from the pyrat api
K-Meech Mar 25, 2026
49b3e52
remove un-needed dependency
K-Meech Mar 25, 2026
b097b59
add function to retreive species id from the pyrat api
K-Meech Mar 26, 2026
d44d849
add filtering by line name
K-Meech Mar 26, 2026
066190c
add filtering by date of birth
K-Meech Mar 26, 2026
477dcc8
change payload to params and expand docstrings
K-Meech Mar 26, 2026
148feae
merge father and mother ids into the main table
K-Meech Mar 26, 2026
11fc462
merge upstream changes
K-Meech Mar 27, 2026
14e3c92
move pyrat files into sub-module
K-Meech Mar 27, 2026
ff02168
refactor mutations expansion into its own function
K-Meech Mar 27, 2026
2dc85f5
separate expanding parent column into another function
K-Meech Mar 27, 2026
f6e573e
working conversion of api info to pandas dataframe
K-Meech Mar 27, 2026
4a7b257
expand docstrings for pyRAT api functions
K-Meech Mar 27, 2026
005df51
rename species_name to Species
K-Meech Mar 27, 2026
63ff8b9
draft of allowing large responses to be split into many
K-Meech Apr 8, 2026
081c9bf
mock father and mother api responses
K-Meech Apr 8, 2026
9ebc90a
allow parameterization of pyrat api test
K-Meech Apr 9, 2026
d9fec3a
allow fetching any pooch datset with a common function
K-Meech Apr 9, 2026
2223594
test get_pyrat_data response against expected csv
K-Meech Apr 9, 2026
5626037
handle no animal data returned from api
K-Meech Apr 9, 2026
21fd4b7
add test for multiple api responses
K-Meech Apr 9, 2026
0a05ddd
pull json response info directly from pooch
K-Meech Apr 9, 2026
bc70351
handle some missing parent data in responses
K-Meech Apr 9, 2026
4c2d1ce
handle no listed parents or mutations in responses
K-Meech Apr 9, 2026
798ad9a
handle some entries missing one parent id
K-Meech Apr 9, 2026
59ff8bb
handle all returned individuals missing one parent
K-Meech Apr 9, 2026
8e6ff76
update get_pyrat_data args to match new test data
K-Meech Apr 9, 2026
d921a63
add test for invalid dates and species names
K-Meech Apr 13, 2026
b5230f3
update pyrat api docstrings
K-Meech Apr 13, 2026
1fb361e
add colony_management sub-module
K-Meech Apr 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,4 @@ venv/
# test data and temporary files
/test_data
/tmp
.env
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ repos:
- id: mypy
additional_dependencies:
- types-setuptools
- types-requests
- repo: https://github.com/mgedmin/check-manifest
rev: "0.50"
hooks:
Expand Down
Empty file added oscar/pyrat/__init__.py
Empty file.
368 changes: 368 additions & 0 deletions oscar/pyrat/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,368 @@
import datetime
import os
from typing import Any, Iterator

import pandas as pd
import requests


def get_pyrat_data(
line_name: str | None = None,
species_name: str | None = None,
birth_date_from: datetime.date | None = None,
birth_date_to: datetime.date | None = None,
max_n_rows: int = 10000,
) -> Iterator[pd.DataFrame]:
"""Fetch animal data directly from the pyRAT api.

To handle the potentially large number of animals returned from pyRAT,
this function returns a generator of pandas dataframes (each with no
more than max_n_rows).

This expects PYRAT_URL, PYRAT_CLIENT_TOKEN and PYRAT_USER_TOKEN to
be set as environment variables.

Parameters
----------
line_name : str | None, optional
Name of line to fetch
species_name : str | None, optional
Name of species to fetch
birth_date_from : datetime.date | None, optional
Earliest birth date to include
birth_date_to : datetime.date | None, optional
Latest birth date to include
max_no_rows : int, optional
Maximum number of items in each returned dataframe (and therefore
returned per request to the pyRAT api)

Returns
-------
Iterator[pd.DataFrame]
Generator of dataframes of returned animal data, in format matching
that exported via the pyRAT UI. If no data is available for the query,
the dataframe will be empty.
"""
if (birth_date_to is not None and birth_date_from is not None) and (
birth_date_to < birth_date_from
):
raise ValueError("birth_date_to must be after birth_date_from")

params = {
"k": [
"animalid",
"eartag_or_id",
"species_name",
"strain_name",
"dateborn",
"mutations",
"parents",
"sacrifice_reason_name",
],
"s": ["eartag_or_id:asc"],
"state": ["live", "sacrificed", "exported"],
"l": max_n_rows,
}

if line_name is not None:
params["strain_name_with_id_like"] = line_name

if species_name is not None:
params["species"] = _get_species_id(species_name)

if birth_date_from is not None:
params["birth_date_from"] = birth_date_from.isoformat()

if birth_date_to is not None:
params["birth_date_to"] = birth_date_to.isoformat()

# Make one request to determine how many results there are
animals_response = _make_pyrat_request("animals", params)
yield _convert_animals_to_df(animals_response.json())
headers = animals_response.headers
total_n = int(headers["x-total-count"])

# If more results than max_n_rows, keep making requests and yielding result
for start_n in range(max_n_rows, total_n, max_n_rows):
params["o"] = start_n
animals_response = _make_pyrat_request("animals", params)
yield _convert_animals_to_df(animals_response.json())


def _make_pyrat_request(
endpoint_name: str, params: dict[str, Any]
) -> requests.Response:
"""Make request to the pyRAT api.

This expects PYRAT_URL, PYRAT_CLIENT_TOKEN and PYRAT_USER_TOKEN to
be set as environment variables.

Parameters
----------
endpoint_name : str
Name of endpoint e.g. 'species'
params : dict[str, Any]
Extra parameters to pass to the endpoint

Returns
-------
requests.Response
The requests response object, containing data from pyRAT
"""
response = requests.get(
url=f"{os.environ['PYRAT_URL']}/api/v3/{endpoint_name}",
auth=(
os.environ["PYRAT_CLIENT_TOKEN"],
os.environ["PYRAT_USER_TOKEN"],
),
params=params,
timeout=5, # number of seconds before timeout
)

# If the request didn't succeed, raise an error containing the status
# code
response.raise_for_status()

return response


def _get_species_id(species_name: str) -> int:
"""Get pyRAT database ID for named species"""

params = {
"k": ["id", "name"],
"s": ["name:asc"],
}
species_ids = _make_pyrat_request("species", params).json()

available_names = []
for species in species_ids:
if species["name"] == species_name:
return species["id"]
else:
available_names.append(species["name"])

raise ValueError(
f"No ID found for species {species_name}: available values "
f"are {available_names}"
)


def _get_mutations_for_eartags(eartags: list[str]) -> pd.DataFrame:
"""Get mutation information for the given animal eartags"""

params = {
"k": ["animalid", "eartag_or_id", "mutations"],
"s": ["eartag_or_id:asc"],
"state": ["live", "sacrificed", "exported"],
"eartag": eartags,
"l": len(eartags),
}
mutation_data = _make_pyrat_request("animals", params).json()

if len(mutation_data) != len(eartags):
raise ValueError(
f"{len(mutation_data)} animals returned for "
f"{len(eartags)} eartags: {eartags}"
)

return pd.DataFrame(mutation_data)


def _convert_animals_to_df(animals_data: list[dict[str, Any]]) -> pd.DataFrame:
"""Convert animal data fetched from the pyRAT api to a pandas DataFrame.

The structure / column names are matched to that exported from the pyRAT
UI.
"""

animals_df = pd.DataFrame(animals_data)
if animals_df.empty:
return animals_df

# Convert dateborn to Year-Month-Day format (removing time info)
new_dates = pd.to_datetime(animals_df.dateborn).dt.strftime("%Y-%m-%d")
animals_df.dateborn = new_dates

# Expand column with information for multiple mutations into their own
# columns
animals_df = _expand_mutations_data(animals_df)

# Expand column with information about multiple parents into a new
# dataframe, including their mutation info
parents_df = _expand_parents_data(animals_df)
animals_df = animals_df.drop(["parents"], axis=1)
animals_df = animals_df.merge(parents_df, on="animalid", how="left")

animals_df = animals_df.drop(["animalid"], axis=1)

# re-name to match data exported via the pyRAT UI, to make downstream
# analysis easier
animals_df = animals_df.rename(
columns={
"eartag_or_id": "ID",
"sacrifice_reason_name": "Sacrifice reason",
"dateborn": "DOB",
"strain_name": "Line / Strain (Name)",
"species_name": "Species",
}
)

return animals_df


def _expand_mutations_data(
animals_df: pd.DataFrame, column_prefix: str = ""
) -> pd.DataFrame:
"""Expand a mutations column into a full dataframe.

Each row of a mutations column contains a list of dictionaries
(one per mutation for the animal). This function expands these into
their own columns labelled Mutation 1, 2... and Grade 1, 2...

Parameters
----------
animals_df : pd.DataFrame
DataFrame of animals data, with raw mutations column
column_prefix: str
Prefix to add to expanded column names e.g. a prefix of 'Father: '
would result in columns Father: Mutation 1, Father: Grade 1 etc.

Returns
-------
pd.DataFrame
Dataframe with separate Mutation and Grade columns
"""

exploded_mutations_col = animals_df.mutations.explode()
mutations_df = pd.DataFrame(
exploded_mutations_col[~exploded_mutations_col.isna()].tolist()
)

mutation_col_name = f"{column_prefix}Mutation"
grade_col_name = f"{column_prefix}Grade"

# If no mutations are listed for any animals, return an empty Mutation 1 /
# Grade 1 column
if mutations_df.empty:
animals_df = animals_df.drop(["mutations"], axis=1)
animals_df[f"{mutation_col_name} 1"] = pd.Series(dtype=str)
animals_df[f"{grade_col_name} 1"] = pd.Series(dtype=str)
return animals_df

mutations_df = mutations_df[["animalid", "mutationname", "mutationgrade"]]
mutations_df = mutations_df.rename(
columns={
"mutationname": mutation_col_name,
"mutationgrade": grade_col_name,
}
)

# Adds a counter for the number of mutation rows per animal id
mutations_df["count"] = (
mutations_df.groupby("animalid").cumcount() + 1
).astype("string")

# Create one row per animalid, with separate columns for
# Mutation 1 / Grade 1, Mutation 2 / Grade 2 ...
pivoted_mutations = mutations_df.pivot(
columns="count",
index="animalid",
values=[mutation_col_name, grade_col_name],
).reset_index()
pivoted_mutations.columns = [
" ".join(column_names).strip()
for column_names in pivoted_mutations.columns.to_flat_index()
]

# merge into the original animals_df, so animalids are in the same order,
# and any animals with no mutations appear with NaN in the correct slots
merged_df = animals_df.drop(["mutations"], axis=1)
merged_df = merged_df.merge(pivoted_mutations, on="animalid", how="left")

return merged_df


def _add_empty_parent_cols(df: pd.DataFrame, parent: str) -> None:
"""Add empty columns for parent mutation and grade"""

df[parent] = pd.Series(dtype=str)
df[f"{parent}: Mutation 1"] = pd.Series(dtype=str)
df[f"{parent}: Grade 1"] = pd.Series(dtype=str)


def _expand_parents_data(animals_df: pd.DataFrame) -> pd.DataFrame:
"""Expand column containing multiple parents' information into separate
columns.

This adds columns for Mother / Father ID, as well as their respective
mutations.
"""

exploded_parents_col = animals_df.parents.explode()
parents_df = pd.DataFrame(
exploded_parents_col[~exploded_parents_col.isna()].tolist()
)

# If no parents are listed for any animals, return empty mother / father
# columns, with empty mutation / grade
if parents_df.empty:
animals_df = animals_df.loc[:, ["animalid"]]
_add_empty_parent_cols(animals_df, "Mother")
_add_empty_parent_cols(animals_df, "Father")
return animals_df

# Create dataframe with one row per animalid, and one column each for
# ID of mother and father
expanded_df = parents_df[["animalid", "parent_eartag", "parent_sex"]]
expanded_df = expanded_df.pivot(
columns="parent_sex", values="parent_eartag", index="animalid"
)
expanded_df = expanded_df.reset_index().rename_axis(None, axis=1)
expanded_df = expanded_df.rename(columns={"f": "Mother", "m": "Father"})

# Fetch mutation info for all parents and merge
for parent in ["Mother", "Father"]:
if parent in expanded_df:
parent_df = _get_mutations_for_parent(expanded_df, parent)
expanded_df = expanded_df.merge(parent_df, on=parent, how="left")
else:
_add_empty_parent_cols(expanded_df, parent)

# merge into the original animals_df, so animalids are in the same order,
# and any animals with no listed parents appear with NaN in the correct
# slots
merged_df = animals_df.loc[:, ["animalid"]]
merged_df = merged_df.merge(expanded_df, on="animalid", how="left")

return expanded_df


def _get_mutations_for_parent(
parents_df: pd.DataFrame, parent: str
) -> pd.DataFrame:
"""Return a dataframe with mutations for all unique parent IDs.

Parameters
----------
parents_df : pd.DataFrame
Dataframe with animalid and 'parent' column
parent : str
Name of column of parent ids

Returns
-------
pd.DataFrame
Dataframe with parent IDs and mutation / grade columns
"""

mutations_df = _get_mutations_for_eartags(
parents_df[parent].dropna().unique().tolist()
)
mutations_df = _expand_mutations_data(
mutations_df, column_prefix=f"{parent}: "
)
mutations_df = mutations_df.drop(["animalid"], axis=1)
mutations_df = mutations_df.rename(columns={"eartag_or_id": parent})

return mutations_df
File renamed without changes.
Loading
Loading