Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
b8285b8
Rewrite to work with beam-refactor
yuvipanda Feb 18, 2023
f9ae413
Move earthdatalogin code to its own package
yuvipanda Feb 22, 2023
6ce2eab
Make time chunks smaller
yuvipanda Feb 22, 2023
d7e4dac
Install pangeo-forge-earthdatalogin as well
yuvipanda Feb 22, 2023
0983037
Add tests!!!!
yuvipanda Feb 22, 2023
e5e9c01
Set earthdata login token for pytest
yuvipanda Feb 22, 2023
3fe2591
Run tests on PR too
yuvipanda Feb 22, 2023
bd9bba8
Try pull_request instead of pr
yuvipanda Feb 22, 2023
995c864
Fix typo
yuvipanda Feb 22, 2023
7092e95
Use an in-progress branch of pangeo-forge-runner
yuvipanda Feb 22, 2023
9a04236
Add github comment report on pytest failures
yuvipanda Feb 22, 2023
cfc99e7
Don't run tests twice
yuvipanda Feb 22, 2023
8bb395c
Use correct way to retrieve earthdata login token
yuvipanda Feb 22, 2023
4a83803
Add missing import
yuvipanda Feb 22, 2023
d1ef59a
Show output from test as it is run
yuvipanda Feb 22, 2023
3077250
Fix typo
yuvipanda Feb 22, 2023
73c413c
Use github cache to cache the pangeo-forge-runner cache
yuvipanda Feb 22, 2023
21be7f4
Rename tests to better reflect reality
yuvipanda Feb 22, 2023
e681eb5
Try to print xarray summary
yuvipanda Feb 23, 2023
76afad1
Cache pip dependencies
yuvipanda Feb 23, 2023
708686f
Fix typo
yuvipanda Feb 23, 2023
32d0b6b
Pretty up xarray output
yuvipanda Feb 23, 2023
28d5e04
Pretty up markdown output a little more
yuvipanda Feb 23, 2023
c0df5f5
Try writing out xarray's HTML repr in summary
yuvipanda Feb 23, 2023
8323dd2
Revert "Try writing out xarray's HTML repr in summary"
yuvipanda Feb 23, 2023
a226c6d
Output tree structure to Github summary
yuvipanda Feb 23, 2023
a12a023
Set ZARR_STORE_NAME as a bash variable too
yuvipanda Feb 23, 2023
9df23b7
Add sudo to apt calls
yuvipanda Feb 23, 2023
02b59d1
Fix yaml typo
yuvipanda Feb 23, 2023
d4c8c37
Don't install tree, it is already there
yuvipanda Feb 23, 2023
9bf8239
Show debug info for tree step
yuvipanda Feb 23, 2023
17a5a1c
Fix quotes to make bash happy
yuvipanda Feb 23, 2023
c3f7346
Bump up time chunk
yuvipanda Feb 23, 2023
2fa672e
Simplify use of pangeo_forge_cmr
yuvipanda Feb 23, 2023
42f392d
Install pangeo-forge-earthdatalogin from pypi
yuvipanda Feb 23, 2023
2535179
Provide logs in the summary too
yuvipanda Feb 25, 2023
bd6cac9
Bring in a WIP version of pangeo-forge-cmr
yuvipanda Feb 25, 2023
db2849d
Add a debug ls
yuvipanda Feb 25, 2023
758bd0d
Fix HTML structure + add a debugging cat
yuvipanda Feb 25, 2023
3dfe643
Fix name of HTML tag + appropriate redirect operator
yuvipanda Feb 25, 2023
798a73e
Fix ridiculous typo
yuvipanda Feb 25, 2023
81abc4d
Don't print apache beam debug log to summary
yuvipanda Feb 25, 2023
f4cf48c
Provide logs from earthdatalogin too
yuvipanda Feb 25, 2023
dc79799
Don't upload beam logs at all
yuvipanda Feb 25, 2023
0e6a30a
Only put INFO level log in summary
yuvipanda Feb 25, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
name: Run recipe with 2 time steps

on:
  - pull_request

jobs:
  run-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
          cache: pip

      - name: Cache inputs
        uses: actions/cache@v3
        with:
          path: storage/cache
          # We don't really need a varying key here, as pangeo-forge-recipes manages this cache
          key: cache-1

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest pytest-github-report

          pip install git+https://github.com/pangeo-forge/pangeo-forge-runner@local-testing
          pip install -r feedstock/requirements.txt

      # Export JOB_NAME / ZARR_STORE_NAME / ZARR_STORE_PATH via GITHUB_ENV so
      # all later steps (pytest, summary writers) agree on the same paths.
      - name: Generate name of the test
        run: |
          JOB_NAME="test-$(date '+%F-%H-%M-%S')"
          ZARR_STORE_NAME="gpm-3imergdl"
          echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
          echo "ZARR_STORE_NAME=${ZARR_STORE_NAME}" >> $GITHUB_ENV
          echo "ZARR_STORE_PATH=storage/output/${JOB_NAME}/${ZARR_STORE_NAME}" >> $GITHUB_ENV

      - name: Test with pytest
        env:
          EARTHDATA_LOGIN_TOKEN: ${{ secrets.EARTHDATA_LOGIN_TOKEN }}
          JOB_NAME: ${{ env.JOB_NAME }}
          ZARR_STORE_NAME: ${{ env.ZARR_STORE_NAME }}
          # Enables the pytest-github-report plugin installed above
          pytest_github_report: true
        run: |
          pytest -s tests/

      # Inline Python step: the produced Zarr store's textual repr goes into
      # the job summary. ${{ env.ZARR_STORE_PATH }} is templated into the
      # script before it runs.
      - name: Write xarray output to summary
        shell: python -u {0}
        run: |
          import xarray as xr
          import os

          step_summary_file = os.environ["GITHUB_STEP_SUMMARY"]

          ds = xr.open_dataset("${{ env.ZARR_STORE_PATH }}", engine="zarr")

          print(str(ds))

          with open(step_summary_file, mode="a") as f:
              f.write("\n")
              f.write("## Produced dataset summary\n")
              f.write("```\n")
              f.write(str(ds))
              f.write("\n")
              f.write("```\n")

      - name: Write tree structure to summary
        run: |
          echo '## Zarr store tree structure' >> $GITHUB_STEP_SUMMARY
          # Must be single quotes here, or bash is unhappy
          echo '```' >> $GITHUB_STEP_SUMMARY
          tree --du -h ${{ env.ZARR_STORE_PATH }} >> $GITHUB_STEP_SUMMARY
          echo '```' >> $GITHUB_STEP_SUMMARY

      # The log file name matches the handler filenames configured in
      # tests/conftest.py ("{job_name}_parse.info.log").
      - name: Write logs to summary
        run: |
          set -x
          echo '## Logs' >> $GITHUB_STEP_SUMMARY

          echo 'Logs produced during the *parsing* of your recipe file' >> $GITHUB_STEP_SUMMARY
          echo '<details><summary>INFO level log</summary>' >> $GITHUB_STEP_SUMMARY
          echo '<pre>' >> $GITHUB_STEP_SUMMARY
          cat ${JOB_NAME}_parse.info.log >> $GITHUB_STEP_SUMMARY
          echo '</pre></details>' >> $GITHUB_STEP_SUMMARY
76 changes: 18 additions & 58 deletions feedstock/recipe.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,26 @@
"""
A recipe to move GPM_3IMERGDL from NASA GES DISC to a cloud analysis-ready format.
"""
import netrc

import aiohttp
import numpy as np
from cmr import GranuleQuery
import apache_beam as beam
from pangeo_forge_cmr import files_from_cmr
from pangeo_forge_earthdatalogin import OpenURLWithEarthDataLogin
from pangeo_forge_recipes.transforms import OpenWithXarray, StoreToZarr

from pangeo_forge_recipes.patterns import pattern_from_file_sequence
from pangeo_forge_recipes.recipes import XarrayZarrRecipe

collection_shortname = ['GPM_3IMERGDL']


# Get a list of granules for this collection from CMR
# Each Granule is a file, provided to us as a HTTPS URL
api_granule = GranuleQuery()
api_granule.parameters(
short_name=collection_shortname,
files = files_from_cmr(
shortname='GPM_3IMERGDL',
concat_dim="time",
nitems_per_file=1
)
# We use print statements to provide debug output as we go along
print(f'number of granules: {api_granule.hits()}')
api_granule_downloadable = api_granule.downloadable()
print(f'number of downloadable granules: f{api_granule_downloadable.downloadable().hits()}')

# retrieve all the granules
granules = api_granule.get_all()

# Find list of all downloadable URLs for the granules
url_list = []
# FIXME: Remove numpy use?
for i in range(0, np.shape(granules)[0]):
for element in granules[i]['links']:
if element['rel'] == 'http://esipfed.org/ns/fedsearch/1.1/data#':
print('adding url: ' + element['href'])
url_list.append(element['href'])
break
else:
# FIXME: Provide useful info here
print('no downloadable url found')

# We need to provide EarthData credentials to fetch the files.
# The credentials of the currently logged in user are used, and passed on to the cloud
# as well when the operation is scaled out. This shall be automated with a machine identity
# in the future.
# go here to set up .netrc file: https://disc.gsfc.nasa.gov/data-access
username, _, password = netrc.netrc().authenticators('urs.earthdata.nasa.gov')
client_kwargs = {
'auth': aiohttp.BasicAuth(username, password),
'trust_env': True,
}

# Now we create the Pangeo Forge Recipe!
# The output will be a cloud analysis ready Zarr archive, created with xarray
recipe = XarrayZarrRecipe(
pattern_from_file_sequence( # The pattern of input files to check
url_list, # List of URLs pointing to our input files, fetched earlier.
concat_dim='time', # TODO: What does this do?
nitems_per_file=1, # TODO: What does this do?
fsspec_open_kwargs=dict(
client_kwargs=client_kwargs
), # Pass our earthdata credentials through to FSSpec, so we can authenticate & fetch data
),
inputs_per_chunk=10, # TODO: What does this do?
recipe = (
beam.Create(files.items())
| OpenURLWithEarthDataLogin()
| OpenWithXarray()
| StoreToZarr(
store_name='gpm-3imergdl',
target_chunks={'time': 2},
combine_dims=files.combine_dim_keys,
)
)

3 changes: 3 additions & 0 deletions feedstock/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
git+https://github.com/yuvipanda/pangeo-forge-cmr@logging
git+https://github.com/yuvipanda/pangeo-forge-earthdatalogin@main
git+https://github.com/yuvipanda/pangeo-forge-recipes@subpath
105 changes: 105 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import pytest
import os
import pathlib
import xarray
import copy
import subprocess
import json
import tempfile
import time

# Repository root: this file lives in tests/, so the parent of its parent dir.
ROOT_DIR = pathlib.Path(__file__).parent.parent
# Name of the Zarr store the recipe produces; CI overrides this via the
# ZARR_STORE_NAME environment variable set in the workflow.
ZARR_STORE_NAME = os.environ.get("ZARR_STORE_NAME", "gpm-3imergdl")



@pytest.fixture(scope="module")
def zarr_store_root() -> str:
    """
    Bake the feedstock recipe once per test module with pangeo-forge-runner
    and return the root path under which the Zarr output was written.
    """
    # Reuse the CI-provided job name when present (so log file names match
    # what the workflow later reads); otherwise derive one from the clock.
    job_name = os.environ.get("JOB_NAME", f"test-{str(int(time.time()))}")

    def file_handler(level: str, filename: str) -> dict:
        # One logging.FileHandler entry for the dictConfig-style logging_config.
        return {
            "class": "logging.FileHandler",
            "level": level,
            "formatter": "simple",
            "filename": filename,
        }

    logging_config = {
        "version": 1,
        "formatters": {
            "simple": {"format": "%(asctime)s %(levelname)s %(message)s"}
        },
        # Separate DEBUG / INFO files for recipe parsing and for apache_beam,
        # named after the job so CI can pick them up afterwards.
        "handlers": {
            "recipe_parse_debug": file_handler("DEBUG", f"{job_name}_parse.debug.log"),
            "recipe_parse_info": file_handler("INFO", f"{job_name}_parse.info.log"),
            "beam_info_log": file_handler("INFO", f"{job_name}_beam.info.log"),
            "beam_debug_log": file_handler("DEBUG", f"{job_name}_beam.debug.log"),
        },
        "loggers": {
            "pangeo_forge_recipes.parse": {
                "level": "DEBUG",
                "handlers": ["recipe_parse_debug", "recipe_parse_info"],
            },
            "apache_beam": {
                "level": "DEBUG",
                "handlers": ["beam_info_log", "beam_debug_log"],
            },
        },
    }

    config = {
        "BaseCommand": {"logging_config": logging_config},
        # Both target and input cache live on the local filesystem under
        # the repository, so the cache can be persisted by actions/cache.
        "TargetStorage": {
            "fsspec_class": "fsspec.implementations.local.LocalFileSystem",
            "root_path": f"file://{ROOT_DIR}/storage/output/{job_name}",
        },
        "InputCacheStorage": {
            "fsspec_class": "fsspec.implementations.local.LocalFileSystem",
            "root_path": f"file://{ROOT_DIR}/storage/cache",
        },
        "Bake": {
            "bakery_class": "pangeo_forge_runner.bakery.local.LocalDirectBakery",
            "job_name": job_name,
        },
    }

    with tempfile.NamedTemporaryFile(suffix=".json") as f:
        print(config)  # echo the effective config into the test output for debugging
        f.write(json.dumps(config).encode())
        f.flush()
        # --prune bakes only a couple of time steps, keeping CI runs short.
        subprocess.check_call([
            "pangeo-forge-runner",
            "bake",
            "--prune",
            "-f",
            f.name,
            "--repo",
            ROOT_DIR,
        ])

    return config["TargetStorage"]["root_path"]



# Not module scoped, as we want to provide a new array for each test
@pytest.fixture
def zarr_store(zarr_store_root: str) -> xarray.Dataset:
    """Open the baked Zarr store as an xarray Dataset, freshly per test."""
    store_path = os.path.join(zarr_store_root, ZARR_STORE_NAME)
    return xarray.open_dataset(store_path, engine="zarr")

26 changes: 26 additions & 0 deletions tests/test_gpm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import xarray


def test_doi(zarr_store: xarray.Dataset):
    """
    Verify the DOI attribute is set correctly
    """
    expected_doi = "10.5067/GPM/IMERGDL/DAY/06"
    actual_doi = zarr_store.attrs["DOI"]
    assert actual_doi == expected_doi


def test_coverage(zarr_store: xarray.Dataset):
    """
    Verify that we have all the lats & lons
    """
    # 1800 x 3600 points — the full global grid expected from this recipe.
    expected_sizes = {"lat": 1800, "lon": 3600}
    for coord, size in expected_sizes.items():
        assert len(zarr_store[coord]) == size


def test_variables(zarr_store: xarray.Dataset):
    """
    Verify that the variables we care about are in here
    """
    expected_vars = ['HQprecipitation', 'HQprecipitation_cnt', 'HQprecipitation_cnt_cond', 'lat', 'lon', 'precipitationCal', 'precipitationCal_cnt', 'precipitationCal_cnt_cond', 'randomError', 'randomError_cnt', 'time', 'time_bnds']
    # Membership test against the mapping directly (no throwaway list), and
    # collect every missing variable so one failure reports them all.
    present = set(zarr_store.variables)
    missing = [var for var in expected_vars if var not in present]
    assert not missing, f"Missing expected variables: {missing}"