
py3dinterpolations's Introduction

Hi there 👋


Contact: Medium

py3dinterpolations's People

Contributors

giocaizzi


py3dinterpolations's Issues

test plotting: assert each axes has 2 lines

https://api.github.com/giocaizzi/py3dinterpolations/blob/4544c573a8e987c599122c5a0906450b7e2bfa18/tests/plotting/test_plotting.py#L43

    ).preprocess()

    # patch the reverse_preprocessing function
    # returning a GridData object
    # created with a sample of the test_data
    with patch(
        "py3dinterpolations.plotting.plotting.reverse_preprocessing",
        return_value=GridData(test_data.sample(n=50, random_state=42)),
    ):
        fig = plot_downsampling(gd, preprocessed_gd)

    # assert is figure
    assert isinstance(fig, Figure)

    # assert figure has n visible axes
    n = len(gd.data.index.get_level_values("ID").unique())
    visible_axes = [ax for ax in fig.axes if ax.get_visible()]
    assert len(visible_axes) == n

    # # TODO assert each axes has 2 lines
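
A minimal sketch of the missing assertion, assuming each visible axes is expected to draw one line for the original data and one for the downsampled data; matplotlib's `Axes.get_lines()` returns the `Line2D` objects on an axes:

    # assert each visible axes has exactly 2 lines
    # (assumption: one line for the original data, one for the downsampled data)
    for ax in visible_axes:
        assert len(ax.get_lines()) == 2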

krige wrapper, method key param

The estimator returns a `method` key that is not needed by the kriging wrapper.

Find a flexible way to handle this; a good option is probably to use the scikit-learn wrapper `Krige()` directly.
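
A hedged sketch of the idea, using PyKrige's scikit-learn compatible `Krige` wrapper directly with `GridSearchCV`, so the `method` key is consumed by the wrapper itself and never has to be popped from the best parameters (the parameter grid values and the way `X`/`y` are built from `GridData` are illustrative assumptions):

from pykrige.rk import Krige
from sklearn.model_selection import GridSearchCV

# training arrays built from the GridData columns
X = griddata.data.reset_index()[["X", "Y", "Z"]].to_numpy()
y = griddata.data.reset_index()["V"].to_numpy()

# "method" is a regular Krige parameter here, so it stays inside the search
param_grid = {
    "method": ["ordinary3d"],
    "variogram_model": ["linear", "power", "gaussian"],
    "nlags": [4, 6, 8],
    "weight": [True, False],
}

search = GridSearchCV(Krige(), param_grid, cv=3)
search.fit(X, y)
best_model = search.best_estimator_  # already configured, nothing to strip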

https://api.github.com/giocaizzi/py3dinterpolations/blob/1edd1fcdb4b5241757cdc617d0adf5732a5f9a2e/py3dinterpolations/modelling/interpolate.py#L105

"""interpolator"""

from typing import Union, Tuple
import numpy as np

from ..core.griddata import GridData
from ..core import create_regulargrid3d_from_griddata
from .modeler import Modeler
from .preprocessor import Preprocessor
from .estimator import Estimator


def interpolate(
    griddata: GridData,
    model_name: str,
    grid_resolution: float,
    model_params: dict = {},
    model_params_grid: dict = {},
    preprocess_kwags: dict = {},
    predict_kwags: dict = {},
    return_model: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, Modeler]]:
    """interpolate griddata

    Interpolate griddata using a Modeler instance that wraps all supported
    models. The model is selected with the argument `model_name`.

    If `model_params` is passed, the model is initialized with those
    parameters. Otherwise, to search for the best parameters, pass the
    `model_params_grid` argument.

    The 3D grid for interpolation is retrieved from the training data.
    At the moment only a regular grid is supported.

    If requested, the griddata is preprocessed using the Preprocessor class.

    Args:
        griddata (GridData): griddata to interpolate
        model_name (str): model name
        grid_resolution (float): grid resolution
        model_params (dict, optional): model parameters. Defaults to {}.
        model_params_grid (dict, optional): grid of model parameters to search
            for the best parameters. Defaults to {}.
        preprocess_kwags (dict, optional): preprocessing parameters. Defaults to {}.
        predict_kwags (dict, optional): prediction parameters. Defaults to {}.
        return_model (bool, optional): return model. Defaults to False.

    Returns:
        Union[np.ndarray, Tuple[np.ndarray, Modeler]]:
            interpolated griddata, optionally with model

    Raises:
        ValueError: either model_params or model_params_grid must be passed
        ValueError: model_params and model_params_grid cannot be passed together
        NotImplementedError: only RegularGrid3D is supported.
        NotImplementedError: Parameter search is only supported for ordinary_kriging

    Examples:
        >>> # interpolate griddata
        >>> interpolated = interpolate(
        >>>     griddata,
        >>>     model_name="ordinary_kriging",
        >>>     grid_resolution=5,
        >>>     model_params={
        >>>         "variogram_model": "linear",
        >>>         "nlags": 6,
        >>>         "weight": True,
        >>>     },
        >>>     preprocess_kwags={
        >>>         "downsampling_res": 0.1,
        >>>         "normalize_xyz": True,
        >>>         "standardize_v": True,
        >>>     },
        >>> )

    """
    # check model_params and model_params_grid
    if model_params == {} and model_params_grid == {}:
        raise ValueError("either model_params or model_params_grid must be passed")
    if model_params != {} and model_params_grid != {}:
        raise ValueError("model_params and model_params_grid cannot be passed together")

    # check grid_resolution is of supported type
    if isinstance(grid_resolution, (float, int)):
        # retrieve associated grid
        grid3d = create_regulargrid3d_from_griddata(griddata, grid_resolution)
    else:
        raise NotImplementedError("only RegularGrid3D is supported.")

    # preprocess griddata if needed
    if preprocess_kwags != {}:
        # preprocessor
        preprocessor = Preprocessor(griddata, **preprocess_kwags)
        # get new griddata
        griddata = preprocessor.preprocess()

    if model_params == {}:
        # implemented only for ordinary_kriging
        if model_name != "ordinary_kriging":
            raise NotImplementedError(
                "Parameter search is only supported for ordinary_kriging"
            )

        # estimate
        est = Estimator(griddata, model_params_grid)

        # TODO: krige wrapper, method key param
        #   that is not needed for the estimator
        #   find a flexible way to handle this, probably a good way is to use directly
        #   the sci-kit wrapper Krige()
        model_params = est.best_params
        model_params.pop("method")

    # init Modeler
    model = Modeler(
        griddata=griddata,
        grid3d=grid3d,
        model_name=model_name,
        model_params=model_params,
    )

    # make predictions
    predictions = model.predict(**predict_kwags)

    # return model
    if return_model:
        return predictions, model
    else:
        return predictions
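
The docstring example covers explicit `model_params`; below is a hedged usage sketch of the parameter-search path with an illustrative grid (the exact keys accepted by `Estimator` are an assumption based on the ordinary-kriging parameters above):

interpolated, model = interpolate(
    griddata,
    model_name="ordinary_kriging",
    grid_resolution=5,
    model_params_grid={
        "variogram_model": ["linear", "power"],
        "nlags": [4, 6],
        "weight": [True, False],
    },
    preprocess_kwags={"normalize_xyz": True, "standardize_v": True},
    return_model=True,
)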

pickled model

when pickling the model, the mesh attribute of grid3d is empty, although it was calculated initially
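
A minimal reproduction sketch of the report, assuming the model comes from `interpolate(..., return_model=True)` and that `grid3d.mesh` is the attribute found empty after unpickling:

import pickle

# model obtained from interpolate(..., return_model=True)
restored = pickle.loads(pickle.dumps(model))

print(model.grid3d.mesh)     # populated, computed at construction time
print(restored.grid3d.mesh)  # reported empty after the pickle round trip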

IDW: remove loops, use a vectorized approach

(a vectorized sketch follows the listing below)

Args:
    x (np.ndarray): x coordinates of the data points
    y (np.ndarray): y coordinates of the data points
    z (np.ndarray): z coordinates of the data points
    values (np.ndarray): values at the data points
    power (float): power of the IDW model

Attributes:
    power (float): power of the IDW model
    distance_matrix (np.ndarray): distance matrix between
        observations and data points

https://api.github.com/giocaizzi/py3dinterpolations/blob/4544c573a8e987c599122c5a0906450b7e2bfa18/py3dinterpolations/modelling/models/idw.py#L18

"""Inverse Distance Weighting (IDW) model"""

import numpy as np
import math

from .deterministic import DeterministicModel


class IDW(DeterministicModel):
    """Simple IDW Model object

    IDW: Inverse Distance Weighting

    Offers tweaking of the power parameter of the IDW model.

    Could be really slow for large datasets, due to the use of loops.

    TODO: remove loops, use a vectorized approach

    Args:
        x (np.ndarray): x coordinates of the data points
        y (np.ndarray): y coordinates of the data points
        z (np.ndarray): z coordinates of the data points
        values (np.ndarray): values at the data points
        power (float): power of the IDW model

    Attributes:
        power (float): power of the IDW model
        distance_matrix (np.ndarray): distance matrix between
            observations and data points
    """

    def __init__(
        self,
        x: np.ndarray,
        y: np.ndarray,
        z: np.ndarray,
        values: np.ndarray,
        power: float = 1,
    ):
        # initialize parent class
        super().__init__(x=x, y=y, z=z, values=values)

        # set power
        self.power = power

    def _compute_point(
        self,
        x: float,
        y: float,
        z: float,
        power: float,
        threshold: float = 0.0000000001,
    ) -> float:
        """find value at a point

        Loops through all the data points to estimate the value at a point
        based on the Inverse Distance Weighting (IDW) method.
        Might be really slow for large datasets.

        This implementation, however, offers easy advanced parametrization with:
        - n_points: number of points to use for the IDW estimation (TODO)
        - max_distance: maximum distance to use for the IDW estimation (TODO)
        """
        nominator = 0
        denominator = 0

        # distance in 3d
        for i in range(0, len(self.values)):
            dist = math.sqrt(
                (x - self.x[i]) * (x - self.x[i])
                + (y - self.y[i]) * (y - self.y[i])
                + (z - self.z[i]) * (z - self.z[i])
            )

            # If the point is really close to one of the data points,
            # return the data point value to avoid singularities
            # EXACT interpolations
            if dist < threshold:
                return self.values[i]

            nominator = nominator + (self.values[i] / pow(dist, power))
            denominator = denominator + (1 / pow(dist, power))

        # Return NaN if the denominator is zero
        if denominator > 0:
            value = nominator / denominator
        else:
            value = np.nan
        return value

    def compute(
        self, gridx: np.ndarray, gridy: np.ndarray, gridz: np.ndarray
    ) -> np.ndarray:
        meshx, meshy, meshz = np.meshgrid(gridx, gridy, gridz, indexing="xy")

        # Create a new mesh to store the coordinates
        new_mesh = np.zeros_like(meshx)

        # Iterate over the indices of the arrays
        for i in range(meshx.shape[0]):
            for j in range(meshx.shape[1]):
                for k in range(meshx.shape[2]):
                    # Store the coordinates in the new mesh
                    new_mesh[i, j, k] = self._compute_point(
                        x=meshx[i, j, k],
                        y=meshy[i, j, k],
                        z=meshz[i, j, k],
                        power=self.power,
                    )
        return new_mesh
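
A hedged sketch of the vectorized approach the TODO asks for: the per-point loops are replaced by a single (n_query, n_data) distance matrix built with NumPy broadcasting (the function name and signature are illustrative, not part of the current API):

import numpy as np

def idw_vectorized(x, y, z, values, qx, qy, qz, power=1.0, threshold=1e-10):
    # (n_query, n_data) distance matrix via broadcasting
    dist = np.sqrt(
        (qx[:, None] - x[None, :]) ** 2
        + (qy[:, None] - y[None, :]) ** 2
        + (qz[:, None] - z[None, :]) ** 2
    )
    # inverse-distance weights; clip tiny distances to avoid division by zero
    weights = 1.0 / np.power(np.maximum(dist, threshold), power)
    estimate = (weights * values[None, :]).sum(axis=1) / weights.sum(axis=1)
    # exact interpolation: a query point sitting on a data point takes its value
    exact_q, exact_d = np.nonzero(dist < threshold)
    estimate[exact_q] = values[exact_d]
    return estimate

Inside `compute`, the three meshes could then be flattened, interpolated in one call, and reshaped back, e.g. `idw_vectorized(self.x, self.y, self.z, self.values, meshx.ravel(), meshy.ravel(), meshz.ravel(), power=self.power).reshape(meshx.shape)`.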

interpolate function

  • technically, returning "interpolated" is not required as it is stored in model.results

Restructured interpolate

Implement IDW parametrization

This implementation, however, offers easy advanced parametrization with (see the sketch after the listing below):

  • n_points: number of points to use for the IDW estimation

  • max_distance: maximum distance to use for the IDW estimation

https://api.github.com/giocaizzi/py3dinterpolations/blob/fe1ba357af32f67cf72cd6a5cfa0c24b14acc115/py3dinterpolations/modelling/models/idw.py#L61

"""Inverse Distance Weighting (IDW) model"""

import numpy as np
import math

from .deterministic import DeterministicModel


class IDW(DeterministicModel):
    """Simple IDW Model object

    IDW: Inverse Distance Weighting

    Offers tweaking of the power parameter of the IDW model.

    Could be really slow for large datasets, due to the use of loops.

    TODO: remove loops, use a vectorized approach

    Args:
        x (np.ndarray): x coordinates of the data points
        y (np.ndarray): y coordinates of the data points
        z (np.ndarray): z coordinates of the data points
        values (np.ndarray): values at the data points
        power (float): power of the IDW model

    Attributes:
        power (float): power of the IDW model
        distance_matrix (np.ndarray): distance matrix between
            observations and data points
    """

    def __init__(
        self,
        x: np.ndarray,
        y: np.ndarray,
        z: np.ndarray,
        values: np.ndarray,
        power: float = 1,
    ):
        # initialize parent class
        super().__init__(x=x, y=y, z=z, values=values)

        # set power
        self.power = power

    def _compute_point(
        self,
        x: float,
        y: float,
        z: float,
        power: float,
        threshold: float = 0.0000000001,
    ) -> float:
        """find value at a point

        Loops through all the data points to estimate the value at a point
        based on the Inverse Distance Weighting (IDW) method.
        Might be really slow for large datasets.

        TODO: Implement parametrization
            This implementation, however, offers easy advanced parametrization with:
            - n_points: number of points to use for the IDW estimation
            - max_distance: maximum distance to use for the IDW estimation
        """
        nominator = 0
        denominator = 0

        # distance in 3d
        for i in range(0, len(self.values)):
            dist = math.sqrt(
                (x - self.x[i]) * (x - self.x[i])
                + (y - self.y[i]) * (y - self.y[i])
                + (z - self.z[i]) * (z - self.z[i])
            )

            # If the point is really close to one of the data points,
            # return the data point value to avoid singularities
            # EXACT interpolations
            if dist < threshold:
                return self.values[i]

            nominator = nominator + (self.values[i] / pow(dist, power))
            denominator = denominator + (1 / pow(dist, power))

        # Return NaN if the denominator is zero
        if denominator > 0:
            value = nominator / denominator
        else:
            value = np.nan
        return value

    def compute(
        self, gridx: np.ndarray, gridy: np.ndarray, gridz: np.ndarray
    ) -> np.ndarray:
        meshx, meshy, meshz = np.meshgrid(gridx, gridy, gridz, indexing="xy")

        # Create a new mesh to store the coordinates
        new_mesh = np.zeros_like(meshx)

        # Iterate over the indices of the arrays
        for i in range(meshx.shape[0]):
            for j in range(meshx.shape[1]):
                for k in range(meshx.shape[2]):
                    # Store the coordinates in the new mesh
                    new_mesh[i, j, k] = self._compute_point(
                        x=meshx[i, j, k],
                        y=meshy[i, j, k],
                        z=meshz[i, j, k],
                        power=self.power,
                    )
        return new_mesh
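
A hedged sketch of the n_points / max_distance parametrization described in the TODO, written on top of the same broadcast distance matrix as the vectorized sketch above (parameter names follow the TODO; their defaults and the NaN fallback are assumptions):

import numpy as np

def idw_estimate(x, y, z, values, qx, qy, qz, power=1.0,
                 n_points=None, max_distance=None, threshold=1e-10):
    # (n_query, n_data) distance matrix via broadcasting
    dist = np.sqrt(
        (qx[:, None] - x[None, :]) ** 2
        + (qy[:, None] - y[None, :]) ** 2
        + (qz[:, None] - z[None, :]) ** 2
    )
    weights = 1.0 / np.power(np.maximum(dist, threshold), power)

    # max_distance: ignore data points farther than the cutoff
    if max_distance is not None:
        weights[dist > max_distance] = 0.0

    # n_points: keep only the n closest data points for each query point
    if n_points is not None and n_points < dist.shape[1]:
        cutoff = np.sort(dist, axis=1)[:, n_points - 1][:, None]
        weights[dist > cutoff] = 0.0

    denominator = weights.sum(axis=1)
    with np.errstate(invalid="ignore", divide="ignore"):
        estimate = (weights * values[None, :]).sum(axis=1) / denominator
    estimate[denominator == 0] = np.nan  # no usable neighbours within the constraints
    return estimate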

add option to downsample by selecting the statistic

e.g. taking the median or the max instead of the mean (a sketch of this option follows the module listing below)

Args:
    data (pd.DataFrame): data to downsample

Returns:
    pd.DataFrame: downsampled data

https://api.github.com/giocaizzi/py3dinterpolations/blob/1edd1fcdb4b5241757cdc617d0adf5732a5f9a2e/py3dinterpolations/modelling/preprocessor.py#L141

"""preprocessor module"""

import pandas as pd

from ..core import GridData
from typing import Union


class Preprocessor:
    """Preprocessor class

    Preprocess data before interpolation. Preprocessing includes:
        - Downsampling, by taking the mean of blocks of given resolution
        - Normalization of X Y Z
        - Standardization of V

    By calling the preprocess method, the data is preprocessed and a new
    GridData object is returned with the preprocessed data and the
    preprocessing parameters set.

    Args:
        griddata (GridData): GridData object to preprocess
        downsampling_res (Union[float, None]): resolution to downsample data
            by taking the mean of blocks of given resolution. If None, no
            downsampling is applied. Default is None.
        normalize_xyz (bool): whether to normalize X Y Z. Default is True.
        standardize_v (bool): whether to standardize V. Default is True.

    Attributes:
        griddata (GridData): GridData object to preprocess
        downsampling_res (Union[float, None]): resolution to downsample data
            by taking the mean of blocks of given resolution. If None, no
            downsampling is applied. Default is None.
        normalize_xyz (bool): whether to normalize X Y Z. Default is True.
        standardize_v (bool): whether to standardize V. Default is True.
        preprocessor_params (dict): dictionary with the parameters of the
            preprocessing. It is set after calling the preprocess method.::

                preprocessor_params = {
                    "downsampling": {
                        "resolution": downsampling_res,
                    },
                    "normalization": {
                        "X": {
                            "min": min_value_of_X,
                            "max": max_value_of_X,
                        },
                        "Y": {
                            "min": min_value_of_Y,
                            "max": max_value_of_Y,
                        },
                        "Z": {
                            "min": min_value_of_Z,
                            "max": max_value_of_Z,
                        },
                    },
                    "standardization": {
                        "mean": mean_value_of_V,
                        "std": std_value_of_V,
                    },

                }

    Examples:
        >>> # preprocess data
        >>> preprocessor = Preprocessor(
        >>>     griddata,
        >>>     downsampling_res=0.1,
        >>>     normalize_xyz=True,
        >>>     standardize_v=True,
        >>> )
        >>> griddata = preprocessor.preprocess()
    """

    preprocessor_params = {}

    def __init__(
        self,
        griddata: GridData,
        downsampling_res: Union[float, None] = None,
        normalize_xyz: bool = True,
        standardize_v: bool = True,
    ):
        self.griddata = griddata
        self.downsampling_res = downsampling_res
        self.normalize_xyz = normalize_xyz
        self.standardize_v = standardize_v

    def preprocess(self) -> GridData:
        """preprocess data

        Returns:
            GridData: new GridData object with the preprocessed data
                and preprocessing parameters set

        """
        # get data
        data = self.griddata.data.copy().reset_index()[["ID", "X", "Y", "Z", "V"]]

        # first downsample
        if self.downsampling_res is not None:
            data = self._downsample_data(data)

        # normalize
        if self.normalize_xyz:
            data = self._normalize_xyz(data)

        # standardize
        if self.standardize_v:
            data = self._standardize_v(data)

        # return new object with preprocessed data and parameters
        return GridData(data, preprocessor_params=self.preprocessor_params)

    def _normalize_xyz(self, data: pd.DataFrame) -> pd.DataFrame:
        """apply normalization to X Y Z

        Args:
            data (pd.DataFrame): data to normalize

        Returns:
            pd.DataFrame: normalized data
        """
        df = data.copy()
        self.preprocessor_params["normalization"] = {}
        for axis in ["X", "Y", "Z"]:
            df[axis], axis_params = _normalize(df[axis])
            self.preprocessor_params["normalization"][axis] = axis_params
        return df

    def _standardize_v(self, data: pd.DataFrame) -> pd.DataFrame:
        """apply standardization to V"""
        df = data.copy()
        df["V"], params = _standardize(df["V"])
        self.preprocessor_params["standardization"] = params
        return df

    def _downsample_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """downsample data making the average by blocks of given resolution

        TODO: add option to downsample by selecting the statistic
            taking the median, or the max

        Args:
            data (pd.DataFrame): data to downsample

        Returns:
            pd.DataFrame: downsampled data
        """
        self.preprocessor_params["downsampling"] = {"resolution": self.downsampling_res}
        idfs = []
        # loop over unique ids
        for id in self.griddata.data.index.get_level_values("ID").unique():
            # filter by id
            idf = self.griddata.data.loc[
                self.griddata.data.index.get_level_values("ID") == id
            ].reset_index()
            # save x,y values
            x = idf["X"][0]
            y = idf["Y"][0]
            # extract z
            idf = idf[["Z", "V"]]
            # downsample by grouping in blocks of given resolution
            # and taking the mean
            idf = idf.groupby(
                idf["Z"].apply(
                    lambda x: self.preprocessor_params["downsampling"]["resolution"]
                    * round(x / self.preprocessor_params["downsampling"]["resolution"])
                )
            )[["V"]].mean()
            # new downsampled df
            idf["X"] = x
            idf["Y"] = y
            idf["ID"] = id
            idf.reset_index(inplace=True)  # reset index resulting from groupby
            idfs.append(idf)
        # return downsampled grid data

        return pd.concat(idfs)


def _standardize(series: pd.Series) -> tuple:
    """standardize series to have mean 0 and std 1

    Args:
        series (pd.Series): series to standardize

    Returns:
        tuple: standardized series and standardization parameters
    """
    series = series.copy()
    # save standardization parameters
    params = {
        "mean": series.mean(),
        "std": series.std(),
    }
    # standardize
    series = (series - params["mean"]) / (params["std"])
    return series, params


def _normalize(series: pd.Series) -> tuple:
    """normalize series between 0 and 1

    Args:
        series (pd.Series): series to normalize

    Returns:
        tuple: normalized series and normalization parameters
    """
    series = series.copy()
    # save normalization parameters
    params = {
        "min": series.min(),
        "max": series.max(),
    }
    # normalize
    series = (series - params["min"]) / (params["max"] - params["min"])
    return series, params


def reverse_preprocessing(griddata: GridData) -> GridData:
    """reverse preprocessing of whole GridData object

    reverse all reversible transformations that have been
    applied to the GridData object.

    This method reverses the operations of preprocessing of:
        - Normalization of X Y Z
        - Standardization of V

    Note:
        It cannot reverse downsampling!

    Returns:
        GridData: GridData object with the data with reversed preprocessing
    """
    if griddata.preprocessor_params is None:
        raise ValueError("No preprocessing has been applied to the data")
    else:
        # get data
        data = griddata.data.copy().reset_index()
        if "normalization" in griddata.preprocessor_params:
            # reverse normalization of X Y Z
            # original_value = normalized_value * (max_value - min_value) + min_value
            for axis in ["X", "Y", "Z"]:
                data[axis] = (
                    data[axis]
                    * (
                        griddata.preprocessor_params["normalization"][axis]["max"]
                        - griddata.preprocessor_params["normalization"][axis]["min"]
                    )
                    + griddata.preprocessor_params["normalization"][axis]["min"]
                )
        if "standardization" in griddata.preprocessor_params:
            # reverse standardization of V
            data["V"] = (
                data["V"] * griddata.preprocessor_params["standardization"]["std"]
                + griddata.preprocessor_params["standardization"]["mean"]
            )
            # return
        return GridData(data)
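
A hedged sketch of the requested option: the aggregation statistic becomes a parameter and is passed straight to the pandas groupby aggregation, mirroring the block grouping already used in `_downsample_data` (the helper name and the `stat` argument are illustrative; only the mean behaviour exists today):

import pandas as pd

def _downsample_block(idf: pd.DataFrame, resolution: float, stat: str = "mean") -> pd.DataFrame:
    # group Z into blocks of the given resolution and aggregate V with the chosen statistic
    # stat may be any pandas aggregation name, e.g. "mean", "median", "max"
    blocks = idf["Z"].apply(lambda z: resolution * round(z / resolution))
    return idf.groupby(blocks)[["V"]].agg(stat).reset_index()

# illustrative usage on a single-ID slice of the data
idf = pd.DataFrame({"Z": [0.1, 0.2, 0.6, 0.7], "V": [1.0, 3.0, 2.0, 4.0]})
print(_downsample_block(idf, resolution=0.5, stat="median"))

Inside `Preprocessor`, the chosen statistic would also be recorded in `preprocessor_params["downsampling"]` next to the resolution.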
