Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ This command downloads the official BERVO ontology from Google Sheets and is use

Commands for working with [ESS-DIVE](https://ess-dive.lbl.gov/) environmental science datasets.

**Requires:** `ESSDIVE_TOKEN` environment variable ([get access](https://docs.ess-dive.lbl.gov/programmatic-tools/ess-dive-dataset-api#get-access))
Public ESS-DIVE metadata can be retrieved without a token. Set `ESSDIVE_TOKEN`
only when you need authenticated access to non-public datasets.

#### get-essdive-metadata

Expand Down Expand Up @@ -551,7 +552,7 @@ This downloads the latest BERVO from Google Sheets, making it easy to keep your
### Environment Variables

```bash
# Required for ESS-DIVE commands
# Optional for ESS-DIVE commands that need authenticated access
export ESSDIVE_TOKEN=your_token_here

# Required for embedding generation (CurateGPT)
Expand All @@ -560,6 +561,9 @@ export OPENAI_API_KEY=your_api_key_here

### Getting ESS-DIVE Access

Public ESS-DIVE metadata requests do not require authentication. For access to
non-public datasets:

1. Visit https://docs.ess-dive.lbl.gov/programmatic-tools/ess-dive-dataset-api#get-access
2. Follow the authentication instructions
3. Set your token as shown above
Expand Down Expand Up @@ -595,7 +599,11 @@ pip install matplotlib seaborn scikit-learn scipy duckdb

## Troubleshooting

### "ESSDIVE_TOKEN is not set"
### Authenticated ESS-DIVE Access

Public ESS-DIVE metadata commands can run without `ESSDIVE_TOKEN`. Set a token
only when accessing datasets that require authentication:

```bash
export ESSDIVE_TOKEN=your_token_here
```
Expand Down
9 changes: 0 additions & 9 deletions src/trowel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,6 @@

# Get token from environment
ESSDIVE_TOKEN = os.getenv("ESSDIVE_TOKEN")
if not ESSDIVE_TOKEN:
logging.error(
"You must set the ESS-DIVE authentication token as the ESSDIVE_TOKEN environment variable."
"\nSee https://docs.ess-dive.lbl.gov/programmatic-tools/ess-dive-dataset-api#get-access"
)


@click.group()
Expand Down Expand Up @@ -129,10 +124,6 @@ def get_essdive_metadata(path, outpath):
with open(path, "r") as f:
identifiers = f.readlines()

if not ESSDIVE_TOKEN:
logging.error("ESSDIVE_TOKEN is not set. Cannot proceed.")
sys.exit(1)

results_path, frequencies_path, filetable_path = get_metadata(
identifiers, ESSDIVE_TOKEN, outpath)

Expand Down
51 changes: 36 additions & 15 deletions src/trowel/wrappers/essdive.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# See https://api.ess-dive.lbl.gov/#/Dataset/getDataset

from io import StringIO
import sys
import string
import os
import tempfile
Expand All @@ -12,6 +11,7 @@
import csv
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote
import polars as pl
import requests
import xml.etree.ElementTree as ET
Expand Down Expand Up @@ -49,6 +49,31 @@
logger = logging.getLogger(__name__)


def normalize_essdive_token(token: Optional[str]) -> Optional[str]:
    """Clean up a raw ESS-DIVE token value.

    Surrounding whitespace is stripped. ``None``, blank strings, and the
    case-insensitive placeholders ``"null"`` / ``"none"`` all collapse to
    ``None``, which signals anonymous public access.

    Args:
        token: Raw token value, or None when no token was supplied.

    Returns:
        The stripped token, or None when no usable token is present.
    """
    # Treat a missing token like an empty one so a single check covers both.
    cleaned = token.strip() if token is not None else ""
    is_placeholder = cleaned.lower() in ("", "null", "none")
    return None if is_placeholder else cleaned


def _essdive_headers(token: Optional[str]) -> dict:
    """Return the HTTP headers for an ESS-DIVE request.

    Args:
        token: Raw token value; it is passed through
            ``normalize_essdive_token`` before use.

    Returns:
        A dict holding a single ``Authorization: Bearer <token>`` entry when
        a usable token exists, otherwise an empty dict so no auth header is
        sent.
    """
    bearer = normalize_essdive_token(token)
    return {"Authorization": f"Bearer {bearer}"} if bearer else {}


def _package_url(identifier: str) -> str:
    """Return the ESS-DIVE package endpoint URL for ``identifier``.

    The identifier is percent-encoded with no characters marked safe, so
    the ``:`` and ``/`` found in DOIs are escaped as well.

    Args:
        identifier: DOI or ESS-DIVE package identifier.

    Returns:
        ``BASE_URL``, ``ENDPOINT`` and the encoded identifier joined by ``/``.
    """
    return "{}/{}/{}".format(BASE_URL, ENDPOINT, quote(identifier, safe=""))


def sanitize_tsv_field(value) -> str:
"""Normalize a value destined for TSV output.

Expand Down Expand Up @@ -196,17 +221,18 @@ def append_dd_content_to_file(content: str, dataset_id: str, source_filename: st


def get_metadata(
identifiers: list, token: str, outpath: str = "."
identifiers: list, token: Optional[str] = None, outpath: str = "."
) -> Tuple[str, str, str]:
"""Get metadata from ESS-DIVE for a list of identifiers.
The identifiers should be DOIs.
This also requires an authentication token for ESS-DIVE.
The identifiers should be DOIs or ESS-DIVE package identifiers.
Public metadata can be retrieved anonymously; provide an ESS-DIVE token for
authenticated requests.

Results are streamed to files in the specified output directory as they are received.

Args:
identifiers: List of DOI identifiers
token: ESS-DIVE authentication token
token: Optional ESS-DIVE authentication token
outpath: Directory to write output files (defaults to current directory)

Returns:
Expand Down Expand Up @@ -241,7 +267,7 @@ def get_metadata(
files_schema.write_csv(filetable_path, separator="\t")

all_variables = {} # key is variable name, value is frequency
headers = {"Authorization": f"Bearer {token}"}
headers = _essdive_headers(token)

results_found = False
files_found = False
Expand Down Expand Up @@ -269,17 +295,12 @@ def get_metadata(
# clean it up
identifier = identifier.strip()

# Check if this is a DOI anyway
if identifier.startswith("ess-dive"):
sys.exit(
f"The provided identifier {identifier} does not appear to be a DOI. Please check the format."
)
if not identifier.startswith("doi:"):
# Keep DOI inputs backward-compatible, but the current API also accepts
# ESS-DIVE package identifiers at /packages/{identifier}.
if not identifier.startswith("doi:") and not identifier.startswith("ess-dive"):
identifier = "doi:" + identifier

get_packages_url = "{}/{}/{}?&isPublic=true".format(
BASE_URL, ENDPOINT, identifier
)
get_packages_url = _package_url(identifier)
response = requests.get(
get_packages_url, headers=headers, verify=True, stream=True)

Expand Down
68 changes: 64 additions & 4 deletions tests/trowel/wrappers/test_essdive_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,28 @@
import requests
import polars as pl

from trowel.wrappers.essdive import get_metadata, get_variable_names
from trowel.wrappers.essdive import (
get_metadata,
get_variable_names,
normalize_essdive_token,
)


class TestGetMetadata(unittest.TestCase):
"""Test suite for get_metadata function."""

def test_normalize_essdive_token(self):
    """Null-like token values normalize to None; real tokens are stripped."""
    # Every value here stands for "no token supplied".
    for null_like in (None, "", " ", "null", "None"):
        self.assertIsNone(normalize_essdive_token(null_like))

    self.assertEqual(normalize_essdive_token(" token "), "token")

@patch('requests.get')
def test_get_metadata_success(self, mock_get):
"""Test get_metadata with successful API response."""
def test_get_metadata_success_without_token(self, mock_get):
"""Test get_metadata with successful anonymous API response."""
# Mock response for successful API call
mock_response = MagicMock()
mock_response.status_code = 200
Expand All @@ -42,9 +55,14 @@ def test_get_metadata_success(self, mock_get):
with tempfile.TemporaryDirectory() as temp_dir:
# Call the function
results_path, frequencies_path, filetable_path = get_metadata(
['doi:10.1234/test'], 'fake_token', temp_dir
['doi:10.1234/test'], None, temp_dir
)

request_url = mock_get.call_args[0][0]
request_headers = mock_get.call_args[1]["headers"]
self.assertIn("doi%3A10.1234%2Ftest", request_url)
self.assertNotIn("Authorization", request_headers)

# Check if files were created
self.assertTrue(os.path.exists(results_path))
self.assertTrue(os.path.exists(frequencies_path))
Expand All @@ -70,6 +88,48 @@ def test_get_metadata_success(self, mock_get):
# pH is normalized to lowercase
self.assertIn('ph', frequencies_content)

@patch('requests.get')
def test_get_metadata_includes_real_token(self, mock_get):
    """A real token is stripped and sent as a Bearer Authorization header."""
    # A 404 response is enough here: only the outgoing request is inspected.
    not_found = MagicMock()
    not_found.status_code = 404
    mock_get.return_value = not_found

    with tempfile.TemporaryDirectory() as workdir:
        get_metadata(['10.1234/test'], ' fake_token ', workdir)

    _, sent_kwargs = mock_get.call_args
    self.assertEqual(
        sent_kwargs["headers"]["Authorization"], "Bearer fake_token")

@patch('requests.get')
def test_get_metadata_omits_null_like_token(self, mock_get):
    """A null-like token value results in no Authorization header."""
    # A 404 response is enough here: only the outgoing request is inspected.
    not_found = MagicMock()
    not_found.status_code = 404
    mock_get.return_value = not_found

    with tempfile.TemporaryDirectory() as workdir:
        get_metadata(['10.1234/test'], 'null', workdir)

    _, sent_kwargs = mock_get.call_args
    self.assertNotIn("Authorization", sent_kwargs["headers"])

@patch('requests.get')
def test_get_metadata_accepts_essdive_identifier(self, mock_get):
    """An ESS-DIVE package identifier is requested as-is under /packages/."""
    # A 404 response is enough here: only the outgoing request is inspected.
    not_found = MagicMock()
    not_found.status_code = 404
    mock_get.return_value = not_found

    with tempfile.TemporaryDirectory() as workdir:
        get_metadata(['ess-dive-test-id'], None, workdir)

    sent_args, _ = mock_get.call_args
    requested_url = sent_args[0]
    self.assertTrue(requested_url.endswith("/packages/ess-dive-test-id"))

@patch('requests.get')
def test_get_metadata_error(self, mock_get):
"""Test get_metadata with API error."""
Expand Down
Loading