Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
b8285b8
Rewrite to work with beam-refactor
yuvipanda Feb 18, 2023
f9ae413
Move earthdatalogin code to its own package
yuvipanda Feb 22, 2023
6ce2eab
Make time chunks smaller
yuvipanda Feb 22, 2023
d7e4dac
Install pangeo-forge-earthdatalogin as well
yuvipanda Feb 22, 2023
0983037
Add tests!!!!
yuvipanda Feb 22, 2023
e5e9c01
Set earthdata login token for pytest
yuvipanda Feb 22, 2023
3fe2591
Run tests on PR too
yuvipanda Feb 22, 2023
bd9bba8
Try pull_request instead of pr
yuvipanda Feb 22, 2023
995c864
Fix typo
yuvipanda Feb 22, 2023
7092e95
Use an in-progress branch of pangeo-forge-runner
yuvipanda Feb 22, 2023
9a04236
Add github comment report on pytest failures
yuvipanda Feb 22, 2023
cfc99e7
Don't run tests twice
yuvipanda Feb 22, 2023
8bb395c
Use correct way to retrieve earthdata login token
yuvipanda Feb 22, 2023
4a83803
Add missing import
yuvipanda Feb 22, 2023
d1ef59a
Show output from test as it is run
yuvipanda Feb 22, 2023
3077250
Fix typo
yuvipanda Feb 22, 2023
73c413c
Use github cache to cache the pangeo-forge-runner cache
yuvipanda Feb 22, 2023
21be7f4
Rename tests to better reflect reality
yuvipanda Feb 22, 2023
e681eb5
Try to print xarray summary
yuvipanda Feb 23, 2023
76afad1
Cache pip dependencies
yuvipanda Feb 23, 2023
708686f
Fix typo
yuvipanda Feb 23, 2023
32d0b6b
Pretty up xarray output
yuvipanda Feb 23, 2023
28d5e04
Pretty up markdown output a little more
yuvipanda Feb 23, 2023
c0df5f5
Try writing out xarray's HTML repr in summary
yuvipanda Feb 23, 2023
8323dd2
Revert "Try writing out xarray's HTML repr in summary"
yuvipanda Feb 23, 2023
a226c6d
Output tree structure to Github summary
yuvipanda Feb 23, 2023
a12a023
Set ZARR_STORE_NAME as a bash variable too
yuvipanda Feb 23, 2023
9df23b7
Add sudo to apt calls
yuvipanda Feb 23, 2023
02b59d1
Fix yaml typo
yuvipanda Feb 23, 2023
d4c8c37
Don't install tree, it is already there
yuvipanda Feb 23, 2023
9bf8239
Show debug info for tree step
yuvipanda Feb 23, 2023
17a5a1c
Fix quotes to make bash happy
yuvipanda Feb 23, 2023
c3f7346
Bump up time chunk
yuvipanda Feb 23, 2023
2fa672e
Simplify use of pangeo_forge_cmr
yuvipanda Feb 23, 2023
42f392d
Install pangeo-forge-earthdatalogin from pypi
yuvipanda Feb 23, 2023
2535179
Provide logs in the summary too
yuvipanda Feb 25, 2023
bd6cac9
Bring in a WIP version of pangeo-forge-cmr
yuvipanda Feb 25, 2023
db2849d
Add a debug ls
yuvipanda Feb 25, 2023
758bd0d
Fix HTML structure + add a debugging cat
yuvipanda Feb 25, 2023
3dfe643
Fix name of HTML tag + appropriate redirect operator
yuvipanda Feb 25, 2023
798a73e
Fix ridiculous typo
yuvipanda Feb 25, 2023
81abc4d
Don't print apache beam debug log to summary
yuvipanda Feb 25, 2023
f4cf48c
Provide logs from earthdatalogin too
yuvipanda Feb 25, 2023
dc79799
Don't upload beam logs at all
yuvipanda Feb 25, 2023
0e6a30a
Only put INFO level log in summary
yuvipanda Feb 25, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
name: Run recipe with 2 time steps

on:
  - pull_request

jobs:
  run-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
          cache: pip

      - name: Cache inputs
        uses: actions/cache@v3
        with:
          path: storage/cache
          # We don't really need a varying key here, as pangeo-forge-recipes manages this cache
          key: cache-1

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest pytest-github-report

          pip install git+https://github.com/pangeo-forge/pangeo-forge-runner@local-testing
          pip install -r feedstock/requirements.txt

      # Export JOB_NAME / ZARR_STORE_NAME / ZARR_STORE_PATH via GITHUB_ENV so
      # all later steps (pytest, summary writers) agree on the same paths.
      - name: Generate name of the test
        run: |
          JOB_NAME="test-$(date '+%F-%H-%M-%S')"
          ZARR_STORE_NAME="gpm-3imergdl"
          echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
          echo "ZARR_STORE_NAME=${ZARR_STORE_NAME}" >> $GITHUB_ENV
          echo "ZARR_STORE_PATH=storage/output/${JOB_NAME}/${ZARR_STORE_NAME}" >> $GITHUB_ENV

      - name: Test with pytest
        env:
          EARTHDATA_LOGIN_TOKEN: ${{ secrets.EARTHDATA_LOGIN_TOKEN }}
          JOB_NAME: ${{ env.JOB_NAME }}
          ZARR_STORE_NAME: ${{ env.ZARR_STORE_NAME }}
          # Enables the pytest-github-report plugin installed above
          pytest_github_report: true
        run: |
          pytest -s tests/

      # Inline Python step: the produced Zarr store's textual repr goes into
      # the job summary. ${{ env.ZARR_STORE_PATH }} is templated into the
      # script before it runs.
      - name: Write xarray output to summary
        shell: python -u {0}
        run: |
          import xarray as xr
          import os

          step_summary_file = os.environ["GITHUB_STEP_SUMMARY"]

          ds = xr.open_dataset("${{ env.ZARR_STORE_PATH }}", engine="zarr")

          print(str(ds))

          with open(step_summary_file, mode="a") as f:
              f.write("\n")
              f.write("## Produced dataset summary\n")
              f.write("```\n")
              f.write(str(ds))
              f.write("\n")
              f.write("```\n")

      - name: Write tree structure to summary
        run: |
          echo '## Zarr store tree structure' >> $GITHUB_STEP_SUMMARY
          # Must be single quotes here, or bash is unhappy
          echo '```' >> $GITHUB_STEP_SUMMARY
          tree --du -h ${{ env.ZARR_STORE_PATH }} >> $GITHUB_STEP_SUMMARY
          echo '```' >> $GITHUB_STEP_SUMMARY

      # The log file name matches the handler filenames configured in
      # tests/conftest.py ("{job_name}_parse.info.log").
      - name: Write logs to summary
        run: |
          set -x
          echo '## Logs' >> $GITHUB_STEP_SUMMARY

          echo 'Logs produced during the *parsing* of your recipe file' >> $GITHUB_STEP_SUMMARY
          echo '<details><summary>INFO level log</summary>' >> $GITHUB_STEP_SUMMARY
          echo '<pre>' >> $GITHUB_STEP_SUMMARY
          cat ${JOB_NAME}_parse.info.log >> $GITHUB_STEP_SUMMARY
          echo '</pre></details>' >> $GITHUB_STEP_SUMMARY
76 changes: 18 additions & 58 deletions feedstock/recipe.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,26 @@
"""
A recipe to move GPM_3IMERGDL from NASA GES DISC to a cloud analysis-ready format.
"""
import netrc

import aiohttp
import numpy as np
from cmr import GranuleQuery
import apache_beam as beam
from pangeo_forge_cmr import files_from_cmr
from pangeo_forge_earthdatalogin import OpenURLWithEarthDataLogin
from pangeo_forge_recipes.transforms import OpenWithXarray, StoreToZarr

from pangeo_forge_recipes.patterns import pattern_from_file_sequence
from pangeo_forge_recipes.recipes import XarrayZarrRecipe

collection_shortname = ['GPM_3IMERGDL']


# Get a list of granules for this collection from CMR
# Each Granule is a file, provided to us as a HTTPS URL
api_granule = GranuleQuery()
api_granule.parameters(
short_name=collection_shortname,
files = files_from_cmr(
shortname='GPM_3IMERGDL',
concat_dim="time",
nitems_per_file=1
)
# We use print statements to provide debug output as we go along
print(f'number of granules: {api_granule.hits()}')
api_granule_downloadable = api_granule.downloadable()
print(f'number of downloadable granules: f{api_granule_downloadable.downloadable().hits()}')

# retrieve all the granules
granules = api_granule.get_all()

# Find list of all downloadable URLs for the granules
url_list = []
# FIXME: Remove numpy use?
for i in range(0, np.shape(granules)[0]):
for element in granules[i]['links']:
if element['rel'] == 'http://esipfed.org/ns/fedsearch/1.1/data#':
print('adding url: ' + element['href'])
url_list.append(element['href'])
break
else:
# FIXME: Provide useful info here
print('no downloadable url found')

# We need to provide EarthData credentials to fetch the files.
# The credentials of the currently logged in user are used, and passed on to the cloud
# as well when the operation is scaled out. This shall be automated with a machine identity
# in the future.
# go here to set up .netrc file: https://disc.gsfc.nasa.gov/data-access
username, _, password = netrc.netrc().authenticators('urs.earthdata.nasa.gov')
client_kwargs = {
'auth': aiohttp.BasicAuth(username, password),
'trust_env': True,
}

# Now we create the Pangeo Forge Recipe!
# The output will be a cloud analysis ready Zarr archive, created with xarray
recipe = XarrayZarrRecipe(
pattern_from_file_sequence( # The pattern of input files to check
url_list, # List of URLs pointing to our input files, fetched earlier.
concat_dim='time', # TODO: What does this do?
nitems_per_file=1, # TODO: What does this do?
fsspec_open_kwargs=dict(
client_kwargs=client_kwargs
), # Pass our earthdata credentials through to FSSpec, so we can authenticate & fetch data
),
inputs_per_chunk=10, # TODO: What does this do?
recipe = (
beam.Create(files.items())
| OpenURLWithEarthDataLogin()
| OpenWithXarray()
| StoreToZarr(
store_name='gpm-3imergdl',
target_chunks={'time': 2},
combine_dims=files.combine_dim_keys,
)
)

3 changes: 3 additions & 0 deletions feedstock/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
git+https://github.com/yuvipanda/pangeo-forge-cmr@logging
git+https://github.com/yuvipanda/pangeo-forge-earthdatalogin@main
git+https://github.com/yuvipanda/pangeo-forge-recipes@subpath
105 changes: 105 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import pytest
import os
import pathlib
import xarray
import copy
import subprocess
import json
import tempfile
import time

# Repository root: this file lives in tests/, so the parent of its parent dir.
ROOT_DIR = pathlib.Path(__file__).parent.parent
# Name of the Zarr store the recipe produces; CI overrides this via the
# ZARR_STORE_NAME environment variable set in the workflow.
ZARR_STORE_NAME = os.environ.get("ZARR_STORE_NAME", "gpm-3imergdl")



@pytest.fixture(scope="module")
def zarr_store_root() -> str:
    """
    Bake the feedstock recipe once per test module with pangeo-forge-runner
    and return the root path under which the Zarr output was written.
    """
    # Reuse the CI-provided job name when present (so log file names match
    # what the workflow later reads); otherwise derive one from the clock.
    job_name = os.environ.get("JOB_NAME", f"test-{str(int(time.time()))}")

    def file_handler(level: str, filename: str) -> dict:
        # One logging.FileHandler entry for the dictConfig-style logging_config.
        return {
            "class": "logging.FileHandler",
            "level": level,
            "formatter": "simple",
            "filename": filename,
        }

    logging_config = {
        "version": 1,
        "formatters": {
            "simple": {"format": "%(asctime)s %(levelname)s %(message)s"}
        },
        # Separate DEBUG / INFO files for recipe parsing and for apache_beam,
        # named after the job so CI can pick them up afterwards.
        "handlers": {
            "recipe_parse_debug": file_handler("DEBUG", f"{job_name}_parse.debug.log"),
            "recipe_parse_info": file_handler("INFO", f"{job_name}_parse.info.log"),
            "beam_info_log": file_handler("INFO", f"{job_name}_beam.info.log"),
            "beam_debug_log": file_handler("DEBUG", f"{job_name}_beam.debug.log"),
        },
        "loggers": {
            "pangeo_forge_recipes.parse": {
                "level": "DEBUG",
                "handlers": ["recipe_parse_debug", "recipe_parse_info"],
            },
            "apache_beam": {
                "level": "DEBUG",
                "handlers": ["beam_info_log", "beam_debug_log"],
            },
        },
    }

    config = {
        "BaseCommand": {"logging_config": logging_config},
        # Both target and input cache live on the local filesystem under
        # the repository, so the cache can be persisted by actions/cache.
        "TargetStorage": {
            "fsspec_class": "fsspec.implementations.local.LocalFileSystem",
            "root_path": f"file://{ROOT_DIR}/storage/output/{job_name}",
        },
        "InputCacheStorage": {
            "fsspec_class": "fsspec.implementations.local.LocalFileSystem",
            "root_path": f"file://{ROOT_DIR}/storage/cache",
        },
        "Bake": {
            "bakery_class": "pangeo_forge_runner.bakery.local.LocalDirectBakery",
            "job_name": job_name,
        },
    }

    with tempfile.NamedTemporaryFile(suffix=".json") as f:
        print(config)  # echo the effective config into the test output for debugging
        f.write(json.dumps(config).encode())
        f.flush()
        # --prune bakes only a couple of time steps, keeping CI runs short.
        subprocess.check_call([
            "pangeo-forge-runner",
            "bake",
            "--prune",
            "-f",
            f.name,
            "--repo",
            ROOT_DIR,
        ])

    return config["TargetStorage"]["root_path"]



# Not module scoped, as we want to provide a new array for each test
@pytest.fixture
def zarr_store(zarr_store_root: str) -> xarray.Dataset:
    """Open the baked Zarr store as an xarray Dataset, freshly per test."""
    store_path = os.path.join(zarr_store_root, ZARR_STORE_NAME)
    return xarray.open_dataset(store_path, engine="zarr")

26 changes: 26 additions & 0 deletions tests/test_gpm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import xarray


def test_doi(zarr_store: xarray.Dataset):
    """
    Verify the DOI attribute is set correctly
    """
    expected_doi = "10.5067/GPM/IMERGDL/DAY/06"
    actual_doi = zarr_store.attrs["DOI"]
    assert actual_doi == expected_doi


def test_coverage(zarr_store: xarray.Dataset):
    """
    Verify that we have all the lats & lons
    """
    # 1800 x 3600 points — the full global grid expected from this recipe.
    expected_sizes = {"lat": 1800, "lon": 3600}
    for coord, size in expected_sizes.items():
        assert len(zarr_store[coord]) == size


def test_variables(zarr_store: xarray.Dataset):
    """
    Verify that the variables we care about are in here
    """
    expected_vars = ['HQprecipitation', 'HQprecipitation_cnt', 'HQprecipitation_cnt_cond', 'lat', 'lon', 'precipitationCal', 'precipitationCal_cnt', 'precipitationCal_cnt_cond', 'randomError', 'randomError_cnt', 'time', 'time_bnds']
    # Membership test against the mapping directly (no throwaway list), and
    # collect every missing variable so one failure reports them all.
    present = set(zarr_store.variables)
    missing = [var for var in expected_vars if var not in present]
    assert not missing, f"Missing expected variables: {missing}"