bump version from 0.2.13 -> 0.3.0

abearab · web-flow · commit a2bad582e986 · 2024-05-11T17:20:16.000-07:00
diff --git a/README.md b/README.md
@@ -8,8 +8,8 @@
 
 ## TL;DR
 
-[ReadTheDocs](https://screenpro2.readthedocs.io) |
-[PyPI](https://pypi.org/project/ScreenPro2)
+[**ReadTheDocs**](https://screenpro2.readthedocs.io) |
+[**PyPI**](https://pypi.org/project/ScreenPro2)
 
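-ScreenPro2 enables perform flexible analysis on high-content CRISPR screening datasets. It has functionalities to process data from diverse CRISPR screen platforms and is designed to be modular to enable easy extension to custom CRISPR screen platforms or other commonly used platforms in addition to the ones currently implemented.
+ScreenPro2 enables flexible analysis of high-content CRISPR screening datasets. It has functionalities to process data from diverse CRISPR screen platforms and is designed to be modular to enable easy extension to custom CRISPR screen platforms or other commonly used platforms in addition to the ones currently implemented.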
 
@@ -39,6 +39,12 @@ pip install git+https://github.com/ArcInstitute/ScreenPro2.git
 ```
 
 ## Usage
+First, import the ScreenPro2 package:
+
+```python
+import screenpro as scp
+```
+
 Data analysis for CRISPR screens with NGS readouts can be broken down into three main steps:
 
 - [Step 1: FASTQ to counts](#step-1-fastq-to-counts)
@@ -47,11 +53,70 @@ Data analysis for CRISPR screens with NGS readouts can be broken down into three
 
 ### Step 1: FASTQ to counts
 
-Since version 0.2.7, ScreenPro2 has a built-in method to process FASTQ files and generate counts. This method is implemented in the `ngs` module 
-and relvent submodules. A minor novelty here has enabled processing single, dual, or multiple sgRNA CRISPR screens. Also, this approach can retain 
-recombination events which can occur in dual or higher order sgRNA CRISPR screens.
+ScreenPro2 has a built-in method to process FASTQ files and generate counts. 
+This method is implemented in the `ngs` module and relevant submodules.
+A minor novelty here has enabled processing single, dual, or multiple sgRNA 
+CRISPR screens. Also, this approach can retain recombination events which can
+occur in dual or higher order sgRNA CRISPR screens.
+
+Currently, the `Counter` class from the `ngs` module can process FASTQ files and generate counts for standard 
+CRISPR screens with [single](#dcas9-crisprai-single-sgrna-screens) or [dual](#crispri-dual-sgrna-screens) 
+guide design. 
+
+Here is draft code to process FASTQ files and generate counts for an experiment with [dCas9 CRISPRa/i single-sgRNA screens](#dcas9-crisprai-single-sgrna-screens):
+
+```python
+# Initialize the Counter object
+counter = scp.Counter(cas_type = 'cas9', library_type = 'single_guide_design')
+
+# Load the reference library
+counter.load_library("<path-to-CRISPR-library-table>", sep = '\t', verbose = True, index_col=None)
+
+# Define the samples
+samples = [] 
+## `samples` is a list of sample ids in the experiment. 
+## Each sample id should match the sample name in the FASTQ files, i.e. <sample_id>.fastq.gz
+
+# Process the FASTQ files and generate counts
+counter.get_counts_matrix(
+    fastq_dir = '<path-to-fastq-directory>',
+    samples = samples,
+    verbose = True
+)
+```
+
+Here is draft code to process FASTQ files and generate counts for an experiment with [CRISPRi-dual-sgRNA-screens](#crispri-dual-sgrna-screens):
 
-There is no example code for this step yet, but a command line interface (CLI) will be available soon. 
+
+```python
+# Initialize the Counter object
+counter = scp.Counter(cas_type = 'dCas9', library_type = 'dual_guide_design')
+
+# Load the reference library
+counter.load_library("<path-to-CRISPR-library-table>", sep = '\t', verbose = True, index_col=None)
+
+# Define the samples
+samples = []
+## `samples` is a list of sample ids in the experiment.
+## Each sample id should match the sample name in the FASTQ files, i.e. <sample_id>_R[1,2].fastq.gz
+
+# Process the FASTQ files and generate counts
+counter.get_counts_matrix(
+    fastq_dir = '<path-to-fastq-directory>',
+    samples = samples,
+    verbose = True
+)
+```
+
+After this, you have `.counts_mat` calculated in the `Counter` object.
+
+___
+
+To proceed, you need to create an `AnnData` object from the counts matrix and metadata. You can use the following code to create an `AnnData` object:
+
+```python
+adata = counter.build_counts_anndata()
+```
 
 ### Step 2: Phenotype calculation
 
diff --git a/docs/source/history.rst b/docs/source/history.rst
@@ -2,14 +2,15 @@
 History
 =======
 
-0.3.0 (coming soon)
+0.4.0 (coming soon)
 ~~~~~~~~~~~~~~~~~~~
 * add command line interface
 
-0.2.11 (May 2024)
+0.2.11 - 0.3.0 (Apr 2024 - May 2024)
 ~~~~~~~~~~~~~~~~~
-* introduce `counter` module
+* introduce `Counter` class as wrapper for `ngs` module
 * improve core functionalities for CLI
+* major bug fixes
 
 0.2.7 - 0.2.10 (Mar 2024 - Apr 2024)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/screenpro/__init__.py b/screenpro/__init__.py
@@ -6,6 +6,6 @@
 from .ngs import Counter
 from .assays import PooledScreens, GImaps
 
-__version__ = "0.2.13"
+__version__ = "0.3.0"
 __author__ = "Abe Arab"
 __email__ = 'abea@arcinstitute.org' # "abarbiology@gmail.com"
diff --git a/screenpro/load.py b/screenpro/load.py
@@ -8,27 +8,54 @@
 from .utils import check_protospacer_length, trim_protospacer
 
 
-def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, protospacer_length=19, verbose=True):
+def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, protospacer_length=19, verbose=True, **args):
     '''Load Cas9 sgRNA library table for single or dual guide design.
     '''
     library = pd.read_csv(
         library_path,
         sep=sep,
         index_col=index_col,
+        **args
     )
 
     ## Evaluate library table and reformat columns for downstream analysis
     # I would like to name the target column 'target' if it is named 'gene'!
-    if 'gene' in library.columns:
-        # rename gene column to target
-        library = library.rename(columns={'gene': 'target'})
     
     if library_type == "single_guide_design":
-        eval_columns = ['target', 'sgID', 'protospacer']
+        eval_columns = ['target', 'sgID', 'protospacer', 'sequence']
+
+        # reformating columns as needed
+        if 'gene' in library.columns:
+            # rename gene column to target
+            library = library.rename(columns={'gene': 'target'})
+        if 'sequence' in library.columns and 'protospacer' not in library.columns:
+            library.rename(columns={'sequence': 'protospacer'}, inplace=True)
+        if 'sgId' in library.columns:
+            library.rename(columns={'sgId': 'sgID'}, inplace=True)
 
         # Upper case protospacer sequences
         library['protospacer'] = library['protospacer'].str.upper()
 
+        protospacer_col = 'protospacer'
+        in_length = check_protospacer_length(library, 'protospacer')
+        if in_length == protospacer_length:
+            pass
+        elif in_length > protospacer_length:
+            if verbose: print(f"Trimming protospacer sequences in '{protospacer_col}' column.")
+            library = trim_protospacer(
+                library, protospacer_col, 
+                '5prime', 
+                in_length - protospacer_length
+            )
+
+        elif in_length < protospacer_length:
+            raise ValueError(
+                f"Input protospacer length for '{protospacer_col}' is less than {protospacer_length}"
+            )
+        
+        # write `sequence` column as `protospacer` (after trimming)
+        library['sequence'] = library['protospacer']
+
         for col in eval_columns:
             if col not in library.columns:
                 raise ValueError(f"Column '{col}' not found in library table.")
@@ -43,6 +70,11 @@ def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, p
             'sequence'
         ]
         
+        # reformating columns as needed
+        if 'gene' in library.columns:
+            # rename gene column to target
+            library = library.rename(columns={'gene': 'target'})
+
         # Upper case protospacer sequences
         library['protospacer_A'] = library['protospacer_A'].str.upper()
         library['protospacer_B'] = library['protospacer_B'].str.upper()
@@ -62,10 +94,10 @@ def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, p
 
             elif in_length < protospacer_length:
                 raise ValueError(
-                    f"Input protospacer length for '{protospacer_col}'is less than {protospacer_length}"
+                    f"Input protospacer length for '{protospacer_col}' is less than {protospacer_length}"
                 )
     
-        # if 'sequence' not in library.columns:
+        # write `sequence` column as `protospacer_A;protospacer_B` (after trimming)
         library['sequence'] = library['protospacer_A'] + ';' + library['protospacer_B']
 
         for col in eval_columns:
@@ -74,6 +106,9 @@ def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, p
 
         library = library[eval_columns]
 
+    else:
+        raise ValueError(f"Invalid library type: {library_type}. Please choose 'single_guide_design' or 'dual_guide_design'.")
+    
     if verbose: print("Library table successfully loaded.")
 
     return library
diff --git a/screenpro/ngs/cas9.py b/screenpro/ngs/cas9.py
@@ -16,15 +16,15 @@ def fastq_to_count_single_guide(
     
     if trim5p_start and trim5p_length:
         sql_cmd = f"""
-        SELECT substr(f.sequence, {trim5p_start}, {trim5p_length}) AS sequence, COUNT(*) as count
+        SELECT substr(f.sequence, {trim5p_start}, {trim5p_length}) AS protospacer, COUNT(*) as count
         FROM fastq_scan('{fastq_file_path}') f
-        GROUP BY sequence
+        GROUP BY protospacer
         """
     else:
         sql_cmd = f"""
-        SELECT f.sequence AS sequence, COUNT(*) as count
+        SELECT f.sequence AS protospacer, COUNT(*) as count
         FROM fastq_scan('{fastq_file_path}') f
-        GROUP BY sequence
+        GROUP BY protospacer
         """
     
     df_count = session.sql(sql_cmd).to_polars()
@@ -91,13 +91,14 @@ def map_to_library_single_guide(df_count, library, return_type='all', verbose=Fa
     # get counts for given input
     res = df_count.clone() #cheap deepcopy/clone
     res = res.sort('count', descending=True)
+
     res = res.with_columns(
-        pl.col("sequence").alias("sequence"),
+        pl.col("protospacer").alias("sequence"),
     )
 
     res_map = pl.DataFrame(library).join(
-            res, on="sequence", how="left"
-        )
+        res, on="sequence", how="left"
+    )
 
     if return_type == 'unmapped' or return_type == 'all':
         res_unmap = res.join(
diff --git a/screenpro/ngs/counter.py b/screenpro/ngs/counter.py
diff --git a/screenpro/phenoscore.py b/screenpro/phenoscore.py