Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 162 additions & 31 deletions oscar/colony_management/pyrat/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,22 +148,22 @@ def _get_species_id(species_name: str) -> int:
)


def _get_mutations_for_eartags(eartags: list[str]) -> pd.DataFrame:
def _get_mutations_for_eartags(all_eartags: list[str]) -> pd.DataFrame:
"""Get mutation information for the given animal eartags"""

params = {
"k": ["animalid", "eartag_or_id", "mutations"],
"s": ["eartag_or_id:asc"],
"state": ["live", "sacrificed", "exported"],
"eartag": eartags,
"l": len(eartags),
"eartag": all_eartags,
"l": len(all_eartags),
}
mutation_data = _make_pyrat_request("animals", params).json()

if len(mutation_data) != len(eartags):
if len(mutation_data) != len(all_eartags):
raise ValueError(
f"{len(mutation_data)} animals returned for "
f"{len(eartags)} eartags: {eartags}"
f"{len(all_eartags)} eartags: {all_eartags}"
)

return pd.DataFrame(mutation_data)
Expand Down Expand Up @@ -312,57 +312,188 @@ def _expand_parents_data(animals_df: pd.DataFrame) -> pd.DataFrame:
_add_empty_parent_cols(animals_df, "Father")
return animals_df

# Create dataframe with one row per animalid, and one column each for
# Create new column for M & F
# numbered for how many times they appear for a specific animal id
parents_df["parent_sex_n"] = parents_df.groupby(
["animalid", "parent_sex"]
).cumcount()
parents_df["parent_sex_n"] = parents_df["parent_sex"] + parents_df[
"parent_sex_n"
].astype(str)

# Create dataframe with one row per animalid, and one column for
# ID of mother and father
expanded_df = parents_df[["animalid", "parent_eartag", "parent_sex"]]
expanded_df = parents_df[["animalid", "parent_eartag", "parent_sex_n"]]
expanded_df = expanded_df.pivot(
columns="parent_sex", values="parent_eartag", index="animalid"
columns="parent_sex_n", values="parent_eartag", index="animalid"
)

# Returns maximum number of male and female parents for that dataset
n_mothers = len(expanded_df.filter(like="f").columns)
n_fathers = len(expanded_df.filter(like="m").columns)

expanded_df = expanded_df.reset_index().rename_axis(None, axis=1)
expanded_df = expanded_df.rename(columns={"f": "Mother", "m": "Father"})

# Fetch mutation info for all parents and merge
for parent in ["Mother", "Father"]:
if parent in expanded_df:
parent_df = _get_mutations_for_parent(expanded_df, parent)
expanded_df = expanded_df.merge(parent_df, on=parent, how="left")
else:
_add_empty_parent_cols(expanded_df, parent)
# expanded_df = _rename_and_merge_parent_columns(
# "Mother", n_mothers, expanded_df
# )
# expanded_df = _rename_and_merge_parent_columns(
# "Father", n_fathers, expanded_df
# )

expanded_df = _fetch_and_merge_parent_mutations(
"Mother", n_mothers, expanded_df
)
expanded_df = _fetch_and_merge_parent_mutations(
"Father", n_fathers, expanded_df
)

# merge into the original animals_df, so animalids are in the same order,
# and any animals with no listed parents appear with NaN in the correct
# slots
merged_df = animals_df.loc[:, ["animalid"]]
merged_df = merged_df.merge(expanded_df, on="animalid", how="left")

return expanded_df
return merged_df


def _get_mutations_for_parent(
parents_df: pd.DataFrame, parent: str
def _fetch_and_merge_parent_mutations(
parent: str, n_parents: int, expanded_df: pd.DataFrame
) -> pd.DataFrame:
"""Return a dataframe with mutations for all unique parent IDs.
"""Groups parents by sex and fetches mutations for them all at once.
The results are then re-assigned to each individual parent and merged
into a single parent column.

Parameters
----------
parents_df : pd.DataFrame
Dataframe with animalid and 'parent' column
parent : str
Name of column of parent ids
"Mother" or "Father"
n_parents : int
Number of parents of this sex present in the dataset
expanded_df : pd.DataFrame
Dataframe with animalid and parent key
columns (f0, f1, ... or m0, m1, ...)

Returns
-------
pd.DataFrame
Dataframe with parent IDs and mutation / grade columns
expanded_df with renamed parent ID columns
and their mutation / grade columns
"""

mutations_df = _get_mutations_for_eartags(
parents_df[parent].dropna().unique().tolist()
if n_parents == 0:
_add_empty_parent_cols(expanded_df, parent)
return expanded_df

column_names, nparents_keys = _create_columns_and_keys_for_n_parents(
parent, n_parents
)

# uses nparents_keys as a dict key to be replaced with column names
expanded_df = expanded_df.rename(
columns=dict(zip(nparents_keys, column_names))
)
mutations_df = _expand_mutations_data(
mutations_df, column_prefix=f"{parent}: "

# for each parent in column names, retrieves all unique eartag ID into list
all_eartags = (
pd.concat([expanded_df[parent_id] for parent_id in column_names])
.dropna()
.unique()
.tolist()
)

mutations_df = _get_mutations_for_eartags(all_eartags)
mutations_df = _expand_mutations_data(mutations_df)

# Split results by parent column where parent_eartags match,
# add parentx as a prefix to the column and merge

expanded_df = assign_parent_mutations(
parent, column_names, mutations_df, expanded_df
)
mutations_df = mutations_df.drop(["animalid"], axis=1)
mutations_df = mutations_df.rename(columns={"eartag_or_id": parent})

return mutations_df
return expanded_df


def _create_columns_and_keys_for_n_parents(parent: str, n_parents: int):
"""Generates and maps the parent keys produced by cumcount
to readable column names.
"""

parent_sex = parent
nparents_keys = []
column_names = []

# Change all 'parent'_keys at once
for i in range(n_parents):
if parent_sex == "Father":
parent_key = f"m{i}"
elif parent_sex == "Mother":
parent_key = f"f{i}"
else:
raise Exception(f"Expected:'m' or 'f' - Received: {parent_sex}")
nparents_keys.append(parent_key)

if i == 0:
column_names.append(parent)
else:
column_names.append(f"{parent}{i + 1}")

return column_names, nparents_keys


def assign_parent_mutations(
    parent: str,
    column_names: list[str],
    mutations_df: pd.DataFrame,
    expanded_df: pd.DataFrame,
) -> pd.DataFrame:
    """Split a combined mutations dataframe by parent column and merge back.

    For each parent column, keeps the rows of ``mutations_df`` whose
    ``eartag_or_id`` appears in that column, prefixes every remaining
    column with ``"<parent column>: "``, then left-merges the result into
    ``expanded_df`` on that parent column.

    Parameters
    ----------
    parent : str
        Display name for the parent sex. Not used by the function body:
        prefixes are taken from the individual names in ``column_names``.
        Kept so existing callers continue to work.
    column_names : list[str]
        Ordered list of parent ID column names in expanded_df
    mutations_df : pd.DataFrame
        Combined mutations dataframe for all parents of this sex. Must
        contain "animalid" and "eartag_or_id" columns.
    expanded_df : pd.DataFrame
        Dataframe with one row per animal,
        containing readable parent ID columns.

    Returns
    -------
    pd.DataFrame
        expanded_df with additional prefixed mutation and grade columns
        merged in for each parent
    """
    # A distinct loop name avoids shadowing the ``parent`` argument.
    for parent_column in column_names:
        parent_eartags = expanded_df[parent_column].dropna().unique()

        # ``drop`` returns a new frame, so no explicit copy is needed before
        # renaming. The parent's own animalid is dropped so it cannot clash
        # with the offspring animalid on merge.
        parent_mutations = mutations_df.loc[
            mutations_df["eartag_or_id"].isin(parent_eartags)
        ].drop(columns=["animalid"])

        # The ID column becomes the merge key; every other column gets the
        # "<parent column>: " prefix.
        parent_mutations = parent_mutations.rename(
            columns=lambda name, key=parent_column: (
                key if name == "eartag_or_id" else f"{key}: {name}"
            )
        )

        # Left merge keeps every animal, leaving NaN where a parent slot is
        # empty or has no mutation data.
        expanded_df = expanded_df.merge(
            parent_mutations, on=parent_column, how="left"
        )

    return expanded_df
26 changes: 15 additions & 11 deletions oscar/colony_management/pyrat/standardise.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def standardise_pyrat_csv(
all_mutation_cols_list = sum(mutation_cols.values(), [])
all_genotype_cols_list = sum(genotype_cols.values(), [])

n_mothers = len(input_csv.filter(like="f").columns)
n_fathers = len(input_csv.filter(like="m").columns)

required_cols = (
[
"ID",
Expand All @@ -62,17 +65,18 @@ def standardise_pyrat_csv(
)

# Get rid of any additional columns + rename to standard names
standard_csv = input_csv[required_cols]
standard_csv = standard_csv.rename(
columns={
"ID": "ID_offspring",
"Line / Strain (Name)": "line_name",
"DOB": "date_of_birth",
"Father": "ID_father",
"Mother": "ID_mother",
"Sacrifice reason": "sacrifice_reason",
}
)
if n_mothers and n_fathers == 1:
standard_csv = input_csv[required_cols]
standard_csv = standard_csv.rename(
columns={
"ID": "ID_offspring",
"Line / Strain (Name)": "line_name",
"DOB": "date_of_birth",
"Father": "ID_father",
"Mother": "ID_mother",
"Sacrifice reason": "sacrifice_reason",
}
)

standard_csv = _filter_or_correct_genotypes(
standard_csv, all_genotype_cols_list
Expand Down
10 changes: 10 additions & 0 deletions tests/pooch_registry.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,18 @@ pyrat-api-multiple-responses.csv 703b598b114c4101295fe2270b243e2e67bf4001d07a4c6
pyrat-api-multiple-responses-father.json 5ab35af0554357d350aedb131d2d32ba5b3bac70da746de937081ef3c27e17e9
pyrat-api-multiple-responses-mother.json 55a4354dd344e1312edbd7e31ac2f5f362448b7e5bc6477115504db28b3e2c40
pyrat-api-multiple-responses-offspring.json 37a7e7e28c75b12a2bc72e84ee963d3370bdadb1dc6235c3381493cf3de945c1
pyrat-api-multiple-response-and-parents-father.json 64ac40ca8fadf337682be9062aaa1cd87091d58b51eb112c2e9fa12802975126
pyrat-api-multiple-response-and-parents-mother.json 9032b519fa0cdf5fef4c47c23823256a34904e7c0be86bdf0661ccc47ff47dc6
pyrat-api-multiple-response-and-parents-offspring.json 69c017f007bee8bbc143a5e1c75e8c1ad338f2ebba8734a4d053b18b760fa98c
pyrat-api-multiple-response-multiple-parents.csv 86ffb23ebe00aa5bd9c2791cf37520bc987da3503095e896c6ac12d634e2c61e
pyrat-api-no-parents-mutations.csv 1f97ace7628509a33b391f31f4fe44be99315a846b219cb708ddb44e6a1e8693
pyrat-api-no-parents-mutations-offspring.json b3a81462048ed4ec1921e6e51e37523262171c8722d6c8cf6e80470183917318
pyrat-api-single-parent.csv 4467a5bd008edbf1d58a8ac7898f8809bdb3a2470cd992b919513c870f70cdd6
pyrat-api-single-parent-offspring.json 3e956a9ebf9aa07942e82a8ff7ef0833442d3b4f41ac7511dd72b5273dc5bb31
pyrat-api-single-parent-mother.json cffb747c5b09a73b37c13e281b16c7292e5577cf02e0c0b35588f5c5640d9fd9
pyrat-data-multiple-parents-1-mutation.csv 8f0ddb19236b013d8abcacf75ba40bb1201afa041899ef1c67e5df309d0e162c
pyrat-data-multiple-parents-2-mutations.csv 972716a431da6a142394c03e843e0eece138488ef93e7c9c4ea18ee083f17f8b
pyrat-data-multiple-parents-3-mutations.csv cae60f38e7617f406fdaf42e53619f5278fa712583e740e6840907c2a9e6f51b
standardised-data-multiple-parents-1-mutation.csv 9c9820495a4c78a66cfb17e62d26c0a26a998427205137f771857b43a78304aa
standardised-data-multiple-parents-2-mutations.csv b2de8d782f427c62d898e6d1c294a9d2c504b525b90fce654e3f3eee185a31d3
standardised-data-multiple-parents-3-mutations.csv b165317104f434541b4839a83f625b340987a43c7f75aca848f9cdd9e1dbae57
2 changes: 1 addition & 1 deletion tests/pooch_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

GIN_REPO = pooch.create(
path=Path(__file__).parents[1] / "test_data",
base_url="https://gin.g-node.org/neuroinformatics/oscar-test-data/raw/master/",
base_url="https://gin.g-node.org/neuroinformatics/oscar-test-data/raw/hc/more-than-two-parents/",
registry=None,
retry_if_failed=5,
)
Expand Down
24 changes: 23 additions & 1 deletion tests/test_unit/pyrat/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,27 @@ def create_animal_response(
"pyrat-api-multiple-responses.csv",
id="Multiple items returned",
),
pytest.param(
create_animal_response(
json_filename=(
"pyrat-api-multiple-response-and-parents-father.json"
),
query_params={"eartag": ["ID-100", "ID-102"]},
),
create_animal_response(
json_filename=(
"pyrat-api-multiple-response-and-parents-mother.json"
),
query_params={"eartag": ["ID-101", "ID-103", "ID-105"]},
),
create_animal_response(
json_filename=(
"pyrat-api-multiple-response-and-parents-offspring.json"
)
),
"pyrat-api-multiple-response-multiple-parents.csv",
id="Multiple items with multiple parents returned",
),
pytest.param(
None,
create_animal_response(
Expand Down Expand Up @@ -142,9 +163,10 @@ def test_get_pyrat_data(

# add mock responses
responses.add(species_response)
for response in [father_response, mother_response, offspring_response]:
for response in [father_response, mother_response]:
if response is not None:
responses.add(response)
responses.add(offspring_response)

pyrat_dfs = get_pyrat_data(
species_name="Mouse",
Expand Down
Loading
Loading