Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,16 @@ Tooling to determine the optimal bandwidths of geographically weighted models.
.. autosummary::
:toctree: generated/

BandwidthSearch
BandwidthSearch

Decomposition
-------------

Geographically weighted decomposition models.

.. currentmodule:: spatialml.decomposition
.. autosummary::
:toctree: generated/

BaseDecomposition
GWPCA
1 change: 1 addition & 0 deletions gwpca_demo.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion spatialml/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import contextlib
from importlib.metadata import PackageNotFoundError, version

from . import base, ensemble, linear_model, search
from . import base, decomposition, ensemble, linear_model, search

with contextlib.suppress(PackageNotFoundError):
__version__ = version("spatialml")
134 changes: 100 additions & 34 deletions spatialml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,18 @@
from joblib import dump, load
from libpysal import graph
from scipy.spatial import KDTree
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.base import (
BaseEstimator,
ClassifierMixin,
RegressorMixin,
)
from sklearn.model_selection import train_test_split
from sklearn.utils.parallel import Parallel, delayed

__all__ = ["BaseClassifier", "BaseRegressor"]
__all__ = [
"BaseClassifier",
"BaseRegressor",
]


def _triangular(distances: np.ndarray, bandwidth: np.ndarray | float) -> np.ndarray:
Expand Down Expand Up @@ -181,13 +188,18 @@ def _setup_model_storage(self):
def _fit_models_batch(
self,
X: pd.DataFrame,
y: pd.Series,
y: pd.Series | None,
weights: graph.Graph,
) -> list:
"""Fit models in batches or all at once"""
"""Fit models in batches or all at once.

When ``y is None`` (unsupervised path, e.g. :class:`BaseDecomposition`),
no target is injected into the per-neighbourhood frames and no
invariance check is performed.
"""
if self.batch_size:
training_output = []
num_groups = len(y)
num_groups = len(X)
indices = X.index
for i in range(0, num_groups, self.batch_size):
if self.verbose:
Expand Down Expand Up @@ -217,29 +229,38 @@ def _fit_models_batch(
def _batch_fit(
self,
X: pd.DataFrame,
y: pd.Series,
y: pd.Series | None,
index: pd.MultiIndex,
_weight: np.ndarray,
X_focals: np.ndarray,
) -> list:
"""Fit a batch of local models"""
"""Fit a batch of local models.

When ``y`` is provided (supervised path), the per-neighbourhood frame
carries a ``_y`` column and an invariance check is run. When ``y is
None`` (unsupervised path used by :class:`BaseDecomposition`), neither
the ``_y`` injection nor the invariance check is applied — local fits
only need ``X`` and ``_weight``.
"""
data = X.copy()
data["_y"] = y
if y is not None:
data["_y"] = y
data = data.loc[index.get_level_values(1)]
data["_weight"] = _weight
grouper = data.groupby(index.get_level_values(0), sort=False)

invariant = grouper["_y"].nunique() == 1
if invariant.any():
if self.strict:
raise ValueError(
f"y at locations {invariant.index[invariant]} is invariant."
)
elif self.strict is None:
warnings.warn(
f"y at locations {invariant.index[invariant]} is invariant.",
stacklevel=3,
)
if y is not None:
invariant = grouper["_y"].nunique() == 1
if invariant.any():
if self.strict:
raise ValueError(
f"y at locations {invariant.index[invariant]} is invariant."
)
elif self.strict is None:
warnings.warn(
f"y at locations {invariant.index[invariant]} is invariant.",
stacklevel=3,
)

return Parallel(n_jobs=self.n_jobs, temp_folder=self.temp_folder)(
delayed(self._fit_local)(
Expand All @@ -252,8 +273,19 @@ def _batch_fit(
for (name, group), focal_x in zip(grouper, X_focals, strict=False)
)

def _fit_global_model(self, X: pd.DataFrame, y: pd.Series):
"""Fit global baseline model"""
def _fit_global_model(self, X: pd.DataFrame, y: pd.Series | None):
"""Fit global baseline model for supervised estimators.

This base-class implementation handles supervised models (classifiers /
regressors) that require both ``X`` and ``y``. The early return when
``y is None`` is intentional: unsupervised subclasses (e.g. ``GWPCA``)
override this method entirely — ``GWPCA._fit_global_model`` fits a
global :class:`sklearn.decomposition.PCA` using only ``X``, and is
called directly from :meth:`GWPCA.fit` so this base-class path is
never reached for decompositions.
"""
if y is None:
return

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? Is there a reason we don't fit the global PCA in a similar way to how we fit the global estimators?

if self._model_type == "random_forest":
self._model_kwargs["oob_score"] = True
# fit global model as a baseline
Expand Down Expand Up @@ -530,29 +562,40 @@ def _predict_local_ensemble(
def _validate_fit_inputs(
self,
X: pd.DataFrame,
y: pd.Series,
y: pd.Series | None,
geometry: gpd.GeoSeries | None,
) -> None:
"""
Validate input data and configuration parameters before model fitting.

This method performs structural and spatial consistency checks to ensure that:
- Feature matrix `X` and target vector `y` have matching lengths.
- At least one spatial structure (`geometry` or `graph`) is provided.
- The provided geometry, if any, matches the number of observations in `X`.
- For supervised estimators (classifiers / regressors): ``y`` is not ``None``
and has the same length as ``X``. A ``ValueError`` is raised even if the
user accidentally passes ``y=None`` to a supervised model.
- For unsupervised decomposition estimators (e.g. ``GWPCA``): ``y`` is
unconditionally ignored — the check is skipped entirely.
- At least one spatial structure (``geometry`` or ``graph``) is provided.
- The provided geometry, if any, matches the number of observations in ``X``.
- Bandwidth is positive when specified.
- Adaptive bandwidth (`fixed=False`) is an integer.
- Adaptive bandwidth (``fixed=False``) is an integer.

Raises
------
ValueError
If any of the validation conditions fail.
"""
# Length checks
if len(X) != len(y):
raise ValueError(
f"X and y must have the same length. Got {len(X)} and {len(y)}."
)
# For supervised estimators y is mandatory — raise if the caller forgot it.
# Decomposition estimators do not use y; _requires_y reflects this.
if self._requires_y:
if y is None:
raise ValueError(
"y must be provided for supervised estimators "
f"({type(self).__name__}). Got None."
)
if len(X) != len(y):
raise ValueError(
f"X and y must have the same length. Got {len(X)} and {len(y)}."
)

# Geometry presence
if self.graph is None and geometry is None:
Expand Down Expand Up @@ -594,6 +637,18 @@ def _validate_fit_inputs(
"kernel must be either a valid string or a callable function."
)

@property
def _requires_y(self) -> bool:
    """Whether this estimator requires and uses ``y``.

    This base implementation always returns ``True``, the value intended
    for supervised estimators (classifiers, regressors).
    ``_validate_fit_inputs`` reads the flag to decide whether a missing
    ``y`` or a length mismatch between ``X`` and ``y`` is an error.
    Unsupervised decompositions such as :class:`BaseDecomposition` are
    expected to report ``False`` instead (NOTE(review): that override is
    not visible in this part of the file -- confirm). Callers (e.g.
    :class:`spatialml.search.BandwidthSearch`) use this flag to
    decide whether to pass ``strict`` and other supervised-only kwargs.

    Returns
    -------
    bool
        ``True`` in this base implementation.
    """
    return True

# Abstract methods that subclasses must implement
def _fit_local(
self,
Expand All @@ -605,7 +660,12 @@ def _fit_local(
) -> list[Hashable]:
raise NotImplementedError("Subclasses must implement _fit_local")

def fit(self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = None):
def fit(
self,
X: pd.DataFrame,
y: pd.Series,
geometry: gpd.GeoSeries | None = None,
):
raise NotImplementedError("Subclasses must implement fit")

def _get_score_data(
Expand Down Expand Up @@ -870,7 +930,10 @@ def __init__(
self._empty_feature_imp = None

def fit(
self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = None
self,
X: pd.DataFrame,
y: pd.Series,
geometry: gpd.GeoSeries | None = None,
) -> "BaseClassifier":
"""Fit geographically weighted local classification models.

Expand Down Expand Up @@ -1604,7 +1667,10 @@ def __init__(
self._empty_score_data = (np.array([]), np.array([]))

def fit(
self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries | None = None
self,
X: pd.DataFrame,
y: pd.Series,
geometry: gpd.GeoSeries | None = None,
) -> "BaseRegressor":
"""Fit geographically weighted local regression models.

Expand Down
4 changes: 4 additions & 0 deletions spatialml/decomposition/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from ._base import BaseDecomposition
from .pca import GWPCA

__all__ = ["BaseDecomposition", "GWPCA"]
Loading
Loading