Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions oscar/breeding_scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def from_string(cls, genotype_str: str) -> tuple[Self, ...]:
tuple[Self, ...]
Converted tuple of genotypes
"""
if not isinstance(genotype_str, str):
raise TypeError(
f"expected type - string, actual type - {type(genotype_str)}"
)

genotype_strings = genotype_str.split("_")
genotypes = [
cls[genotype_string.upper()]
Expand Down
43 changes: 42 additions & 1 deletion oscar/colony_management/pyrat/standardise.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pandas as pd

from oscar.breeding_scheme import Genotype
from oscar.breeding_scheme import BreedingScheme, Genotype


class Identifier(Enum):
Expand Down Expand Up @@ -94,6 +94,9 @@ def standardise_pyrat_csv(
id_offspring_col = standard_csv.pop("ID_offspring")
standard_csv.insert(0, "ID_offspring", id_offspring_col)

to_ignore = standard_csv.apply(_remove_impossible_breeding_schemes, axis=1)
standard_csv = standard_csv[~to_ignore]
Comment on lines +97 to +98

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly personal preference, but maybe change the variable name here to make it super clear what is being ignored.

Suggested change
to_ignore = standard_csv.apply(_remove_impossible_breeding_schemes, axis=1)
standard_csv = standard_csv[~to_ignore]
impossible_breeding_schemes = standard_csv.apply(_is_impossible_breeding_scheme, axis=1)
standard_csv = standard_csv[~impossible_breeding_schemes]


return standard_csv


Expand Down Expand Up @@ -384,3 +387,41 @@ def _make_combined_genotype_column_for_identifier(
line_data.loc[genotyped_rows, new_col_name] = pivoted_mutations.loc[
genotyped_rows, unique_mutations
].agg("_".join, axis=1)


def _remove_impossible_breeding_schemes(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this function doesn't directly remove the impossible breeding schemes, but rather identifies them with True / False. I'd maybe re-name this? Something like:

Suggested change
def _remove_impossible_breeding_schemes(
def _is_impossible_breeding_scheme(

standardised_df_row: pd.Series,
) -> bool:
"""Retrieves parent genotypes and pulls the mendalian ratios from
BreedingScheme.
Compares offspring to these ratios, removing those which are not possible.
e.g. hom x hom parents cannot produce wt x wt offspring.
Comment on lines +394 to +398

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe modify this so it's clear that this particular function isn't removing them?

Suggested change
) -> bool:
"""Retrieves parent genotypes and pulls the mendalian ratios from
BreedingScheme.
Compares offspring to these ratios, removing those which are not possible.
e.g. hom x hom parents cannot produce wt x wt offspring.
) -> bool:
"""
Checks whether the given row contains an impossible breeding scheme.
Retrieves parent genotypes and pulls the mendalian ratios from
BreedingScheme. Compares offspring to these ratios, returning True for
those which are not possible.
e.g. hom x hom parents cannot produce wt offspring.


Parameters
----------
standardised_df_row : pd.Series
rom from standardised_dataframe (pd.DataFrame): standardised PyRAT df

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
rom from standardised_dataframe (pd.DataFrame): standardised PyRAT df
row from standardised_dataframe (pd.DataFrame): standardised PyRAT df


Returns
-------
bool
bool of whether or not that row contains an impossible breeding scheme
"""

pop = False

genotype_father = standardised_df_row["genotype_father"]
genotype_mother = standardised_df_row["genotype_mother"]
genotype_offspring = standardised_df_row["genotype_offspring"]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To keep rows with un-genotyped offspring, you could add something like this here:

# If the offspring are un-genotyped, we keep the row as there is no way
# of checking if the breeding scheme was valid
if pd.isna(genotype_offspring):
    return pop

Hopefully this should make the test_standardise_genotypes test pass, although I see that there is one remaining impossible breeding scheme in the test file I missed. You can change: pyrat-data-forbidden-genotypes.csv / standardised-data-forbidden-genotypes.csv ID-011 genotype_mother to wt_het on your GIN branch.

typed_offspring = Genotype.from_string(genotype_offspring)

scheme = BreedingScheme(genotype_father, genotype_mother)
ratio = scheme.mendelian_ratio()

if typed_offspring not in ratio:
pop = True
elif ratio[typed_offspring] == 0:
pop = True

return pop
2 changes: 2 additions & 0 deletions tests/pooch_registry.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ standardised-data-2-mutations.csv 7bf5409814715013806602afbfc87f9a710f3ecf7aa864
pyrat-data-3-mutations.csv e6f95c47a3d4db6d1c7ec75e89de013f1b1e3331d8609fcd4be17c61ba21bece
standardised-data-3-mutations.csv f598f88afe11aa6bd99cddd6b30fa2a2ba36b473daa22ad46f9171bcad3a1d71
pyrat-data-forbidden-genotypes.csv 4fccb7fda7167d7e3253bad338e6fda03c5d8012362bbf7fc11d7ee4c0ec4487
pyrat-data-forbidden-schemes.csv c5c14c144b036eadcbea5c3ba5ce99ea5948fd51b27f4c6e3fbdd5f13ce4a7a5
standardised-data-forbidden-genotypes.csv 4859e1b97ec296a707f1d490464e4a8d18939ff1c26c9c9a604c7744580c791b
standardised-data-forbidden-schemes.csv d87d28762d4d2ab1bac1390e4c11927516fa881a3cef1bfe21e03540721072b7
pyrat-api-single-response.csv 14314c56a153d048c0c86d2253ce8c85a965fcbb2e9d2a8f37f4d5a58793b287
pyrat-api-single-response-father.json 9bab0bd1c9810841dcf76e83f788f6ee61431211643252eb3fe4c6e0a365483e
pyrat-api-single-response-mother.json cffb747c5b09a73b37c13e281b16c7292e5577cf02e0c0b35588f5c5640d9fd9
Expand Down
2 changes: 1 addition & 1 deletion tests/pooch_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

GIN_REPO = pooch.create(
path=Path(__file__).parents[1] / "test_data",
base_url="https://gin.g-node.org/neuroinformatics/oscar-test-data/raw/master/",
base_url="https://gin.g-node.org/neuroinformatics/oscar-test-data/raw/hc/add-forbidden-schemes",
registry=None,
retry_if_failed=5,
)
Expand Down
21 changes: 21 additions & 0 deletions tests/test_unit/pyrat/test_standardise.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,27 @@ def test_standardise_genotypes():
pooch_data_path("standardised-data-forbidden-genotypes.csv")
)

with pytest.raises(TypeError):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By adding with pytest.raises(TypeError), this test is allowed to pass even though it throws an error during standardise_pyrat_csv and never reaches pd.testing.assert_frame_equal. This means it is no longer testing that genotypes are standardised correctly.

You'll need to remove the with pytest.raises(TypeError) and rather add a fix in _remove_impossible_breeding_schemes instead. Otherwise, at the moment, any ungenotyped individuals that enter standardise_pyrat_csv will cause the processing to stop early (we need these ungenotyped individuals later in the historical stats part - see this issue: #8)

standard_csv = standardise_pyrat_csv(pyrat_csv)
pd.testing.assert_frame_equal(
standard_csv.reset_index(drop=True),
expected_csv.reset_index(drop=True),
)


def test_remove_impossible_breeding_schemes():
"""
Test that impossible breeding schemes are removed from raw data.
(e.g. hom x hom parents cannot make wt offspring)
"""

pyrat_csv = pd.read_csv(
pooch_data_path("pyrat-data-forbidden-schemes.csv")
)
expected_csv = pd.read_csv(
pooch_data_path("standardised-data-forbidden-schemes.csv")
)

standard_csv = standardise_pyrat_csv(pyrat_csv)
pd.testing.assert_frame_equal(
standard_csv.reset_index(drop=True),
Expand Down
Loading