r"""
Provide finite difference approxation for the gradient and hessian.
This is basically the same as provided by the library
`numdifftools <https://numdifftools.readthedocs.io/en/latest/index.html.>`_ but it
supports multiprocessing for gradient approximation which is quite useful when
working with large problems (python is slow).
grad = finite_gradient(np.array([1, 1]), rosen)
Chapitre très intéressant:
https://pythonnumericalmethods.berkeley.edu/notebooks/chapter20.02-Finite-Difference-Approximating-Derivatives.html
@author: acollet
"""
import logging
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass
from typing import (
Any,
Callable,
Dict,
Generator,
List,
Optional,
Sequence,
Tuple,
Union,
)
import numpy as np
from pyrtid.utils.types import NDArrayFloat
def rosen(x):
"""Rosenbrock function."""
return (1 - x[0]) ** 2 + 100.0 * (x[1] - x[0] ** 2) ** 2
def rosen_gradient(x):
"""Rosenbrock function first derivative."""
return np.array(
[400 * x[0] ** 3 + (2 - 400 * x[1]) * x[0] - 2, 200 * (x[1] - x[0] ** 2)]
)
def rosen_hessian(x):
"""Rosenbrock function second derivative."""
return np.array(
[[1200 * x[0] ** 2 - 400 * x[1] + 2, -400 * x[0]], [-400 * x[0], 200.0]]
)
[docs]def is_jacobian_correct(
x: np.ndarray,
fm: Callable,
jac: Callable,
fm_args: Optional[Union[Tuple[Any], List[Any]]] = None,
fm_kwargs: Optional[Dict[str, Any]] = None,
jac_args: Optional[Tuple[Any]] = None,
jac_kwargs: Optional[Dict[str, Any]] = None,
accuracy: int = 0,
eps: Optional[float] = None,
max_workers: int = 1,
) -> bool:
"""
Check by finite difference if the Jacobian matrix is correct.
Parameters
----------
x : np.ndarray
The input parameters vector.
fm : Callable
Forward model.
jac : Callable
Jacobian model.
fm_args: Tuple[Any]
Positional arguments for the forward model.
fm_kwargs : Dict[Any, Any]
Keyword arguments for the forward model.
grad_args: Tuple[Any]
Positional arguments for the gradient model.
grad_kwargs : Dict[Any, Any]
Keyword arguments for the gradient model.
accuracy : int, optional
Number of points to use for the finite difference approximation.
Possible values are 0 (2 points), 1 (4 points), 2 (6 points),
3 (8 points). The default is 0 which corresponds to the central
difference scheme (2 points).
eps: float, optional
The epsilon for the computation (h). The default value has been
taken from the C++ implementation of
:cite:`wieschollek2016cppoptimizationlibrary`, and should correspond
to the optimal h taking into account the roundoff errors due to
the machine precision. The default is -2.2204e-6.
max_workers: int
Number of workers used. If different from one, the calculation relies on
multi-processing to decrease the computation time. The default is 1.
Returns
-------
bool
True if the Jacobian is correct, false otherwise.
"""
if jac_args is None:
_jac_args: Tuple[Any] = ()
else:
_jac_args = jac_args
if jac_kwargs is None:
jac_kwargs = {}
actual_jac = jac(x, *_jac_args, **jac_kwargs)
expected_jac = finite_jacobian(
x, fm, fm_args, fm_kwargs, accuracy, eps, max_workers
)
# Handle the gradient case
# if actual_jac.ndim != expected_jac.ndim:
# expected_jac = expected_jac.reshape(*actual_jac.shape)
return is_all_close(actual_jac, expected_jac)
[docs]def is_gradient_correct(
x: np.ndarray,
fm: Callable,
grad: Callable,
fm_args: Optional[Union[Tuple[Any], List[Any]]] = None,
fm_kwargs: Optional[Dict[str, Any]] = None,
grad_args: Optional[Tuple[Any]] = None,
grad_kwargs: Optional[Dict[str, Any]] = None,
accuracy: int = 0,
eps: Optional[float] = None,
max_workers: int = 1,
) -> bool:
"""
Check by finite difference if the gradient is correct.
Parameters
----------
x : np.ndarray
The input parameters vector.
fm : Callable
Forward model.
grad : Callable
Gradient model.
fm_args: Tuple[Any]
Positional arguments for the forward model.
fm_kwargs : Dict[Any, Any]
Keyword arguments for the forward model.
grad_args: Tuple[Any]
Positional arguments for the gradient model.
grad_kwargs : Dict[Any, Any]
Keyword arguments for the gradient model.
accuracy : int, optional
Number of points to use for the finite difference approximation.
Possible values are 0 (2 points), 1 (4 points), 2 (6 points),
3 (4 points). The default is 0 which corresponds to the central
difference scheme (2 points).
eps: float, optional
The epsilon for the computation (h). The default value has been
taken from the C++ implementation of
:cite:`wieschollek2016cppoptimizationlibrary`, and should correspond
to the optimal h taking into account the roundoff errors due to
the machine precision. The default is -2.2204e-6.
max_workers: int
Number of workers used. If different from one, the calculation relies on
multi-processing to decrease the computation time. The default is 1.
Returns
-------
bool
True if the gradient is correct, false otherwise.
"""
return is_jacobian_correct(
x=x,
fm=fm,
jac=grad,
fm_args=fm_args,
fm_kwargs=fm_kwargs,
jac_args=grad_args,
jac_kwargs=grad_kwargs,
accuracy=accuracy,
eps=eps,
max_workers=max_workers,
)
[docs]def is_all_close(v1: np.ndarray, v2: np.ndarray, eps: float = 1e-2) -> bool:
"""Return whether the two vectors are approximately equal."""
scale = np.maximum(np.maximum(np.abs(v1), np.abs(v2)), 1.0)
return bool(np.less_equal((np.abs(v1 - v2)), eps * scale).all())
@dataclass
class FDParams:
x0: NDArrayFloat
shape: Tuple[int]
inner_steps: int
coeff: List[List[float]]
coeff2: List[List[float]]
fm: Callable
fm_args: Sequence[Any]
fm_kwargs: Dict[str, Any]
dd_val: float
accuracy: int
eps: float
def approximate_elt_jac(elt_id: int, fd_params: FDParams) -> NDArrayFloat:
_res = []
for s in range(fd_params.inner_steps):
fd_params.x0[elt_id] += fd_params.coeff2[fd_params.accuracy][s] * fd_params.eps
_res.append(
fd_params.coeff[fd_params.accuracy][s]
* fd_params.fm(
fd_params.x0.reshape(fd_params.shape),
*fd_params.fm_args,
**fd_params.fm_kwargs,
)
)
fd_params.x0[elt_id] -= fd_params.coeff2[fd_params.accuracy][s] * fd_params.eps
return np.sum(_res, axis=0) / fd_params.dd_val
[docs]def finite_jacobian(
x: np.ndarray,
fm: Callable,
fm_args: Optional[Sequence[Any]] = None,
fm_kwargs: Optional[Dict[str, Any]] = None,
accuracy: int = 0,
eps: Optional[float] = None,
max_workers: int = 1,
) -> NDArrayFloat:
r"""
Compute the Jacobian by finite difference.
The Jacobian is computed by using Taylor Series. For instance, if
accurcy = 1, we use 4 points, which means that
we take the Taylor series of $f$ around $a = x_j$ and compute the series
at $x = x_{j-2}, x_{j-1}, x_{j+1}, x_{j+2}$.
.. math::
\begin{eqnarray*}
f(x_{j-2}) &=& f(x_j) - 2hf^{\prime}(x_j) + \frac{4h^2f''(x_j)}{2} -
\frac{8h^3f'''(x_j)}{6} + \frac{16h^4f''''(x_j)}{24}
- \frac{32h^5f'''''(x_j)}{120} + \cdots\\
f(x_{j-1}) &=& f(x_j) - hf^{\prime}(x_j) + \frac{h^2f''(x_j)}{2}
- \frac{h^3f'''(x_j)}{6} + \frac{h^4f''''(x_j)}{24}
- \frac{h^5f'''''(x_j)}{120} + \cdots\\
f(x_{j+1}) &=& f(x_j) + hf^{\prime}(x_j) + \frac{h^2f''(x_j)}{2}
+ \frac{h^3f'''(x_j)}{6} + \frac{h^4f''''(x_j)}{24}
+ \frac{h^5f'''''(x_j)}{120} + \cdots\\
f(x_{j+2}) &=& f(x_j) + 2hf^{\prime}(x_j) + \frac{4h^2f''(x_j)}{2}
+ \frac{8h^3f'''(x_j)}{6} + \frac{16h^4f''''(x_j)}{24}
+ \frac{32h^5f'''''(x_j)}{120} + \cdots
\end{eqnarray*}
To get the $h^2, h^3$, and $h^4$ terms to cancel out, we can compute
.. math::
f(x_{j-2}) - 8f(x_{j-1}) + 8f(x_{j-1}) - f(x_{j+2})
= 12hf^{\prime}(x_j) - \frac{48h^5f'''''(x_j)}{120}
which can be rearranged to
.. math::
f^{\prime}(x_j) = \frac{f(x_{j-2}) - 8f(x_{j-1})
+ 8f(x_{j-1}) - f(x_{j+2})}{12h} + O(h^4).
This formula is a better approximation for the derivative at $x_j$
than the central difference formula, but requires twice as many
calculations.
Parameters
----------
x : np.ndarray
The input parameters array.
fm : Callable
Forward model.
fm_args: Tuple[Any]
Positional arguments for the forward model.
fm_kwargs : Dict[Any, Any]
Keyword arguments for the forward model.
accuracy : int, optional
Number of points to use for the finite difference approximation.
Possible values are 0 (2 points), 1 (4 points), 2 (6 points),
3 (8 points). The default is 0 which corresponds to the central
difference scheme (2 points).
eps: float, optional
The epsilon for the computation (h). By default, it take 1e-6 times
the maximum absolute value of the input data.
.. :math:
\delta s = \begin{cases}
\epsilon^{1/3} min\Big(max\left(\lvert s\rvert\right),
1\Big) & \text{if} \; max\left(\lvert s\rvert\right) > 0\\
\epsilon^{1/3} & \text{otherwise}
\end{cases}
max_workers: int
Number of workers used. If different from one, the calculation relies on
multi-processing to decrease the computation time. The default is 1.
Returns
-------
NDArrayFloat
The finite difference Jacobian matrix.
"""
x0 = np.array(x).astype(np.float64)
if eps is None:
_eps = np.power(np.finfo(float).eps, 1 / 3) * min(1, np.max(np.abs(x0)))
if _eps == 0:
_eps = np.power(np.finfo(float).eps, 1 / 3)
else:
_eps = eps
logging.getLogger("Finite differences").info(f"eps = {_eps:.2e}")
if accuracy not in [0, 1, 2, 3]:
raise ValueError("The accuracy should be 0, 1, 2 or 3!")
dd = np.array([2.0, 12.0, 60.0, 840.0])
fd_params = FDParams(
x0=x0.ravel(),
shape=x0.shape,
inner_steps=2 * (accuracy + 1),
coeff=[
[1.0, -1.0],
[1, -8.0, 8.0, -1.0],
[-1, 9, -45.0, 45.0, -9.0, 1.0],
[3.0, -32.0, 168.0, -672.0, 672.0, -168.0, 32.0, -3.0],
],
coeff2=[
[1.0, -1.0],
[-2.0, -1.0, 1.0, 2.0],
[-3.0, -2.0, -1.0, 1.0, 2.0, 3.0],
[-4, -3.0, -2.0, -1.0, 1.0, 2.0, 3.0, 4.0],
],
fm=fm,
fm_args=fm_args if fm_args is not None else [],
fm_kwargs=fm_kwargs if fm_kwargs is not None else {},
dd_val=dd[accuracy] * _eps,
accuracy=accuracy,
eps=_eps,
)
def get_fd_params() -> Generator:
while True:
yield fd_params
results: List[NDArrayFloat] = []
# Single worker (no multi-processing)
if max_workers == 1:
for i in range(x0.size):
results.append(approximate_elt_jac(i, fd_params))
# Multi-processing enabled
else:
with ProcessPoolExecutor(max_workers=max_workers) as executor:
results = list(
executor.map(
approximate_elt_jac,
range(x0.size),
get_fd_params(),
)
)
# (*output shape, *input shape) -> we do not flatten
return np.stack(results, axis=-1).reshape(*results[0].shape, *fd_params.shape)
[docs]def finite_gradient(
x: np.ndarray,
fm: Callable,
fm_args: Optional[Sequence[Any]] = None,
fm_kwargs: Optional[Dict[str, Any]] = None,
accuracy: int = 0,
eps: Optional[float] = None,
max_workers: int = 1,
) -> NDArrayFloat:
r"""
Compute the gradient by finite difference.
The gradient is computed by using Taylor Series. For instance, if
accurcy = 1, we use 4 points, which means that
we take the Taylor series of $f$ around $a = x_j$ and compute the series
at $x = x_{j-2}, x_{j-1}, x_{j+1}, x_{j+2}$.
.. math::
\begin{eqnarray*}
f(x_{j-2}) &=& f(x_j) - 2hf^{\prime}(x_j) + \frac{4h^2f''(x_j)}{2} -
\frac{8h^3f'''(x_j)}{6} + \frac{16h^4f''''(x_j)}{24}
- \frac{32h^5f'''''(x_j)}{120} + \cdots\\
f(x_{j-1}) &=& f(x_j) - hf^{\prime}(x_j) + \frac{h^2f''(x_j)}{2}
- \frac{h^3f'''(x_j)}{6} + \frac{h^4f''''(x_j)}{24}
- \frac{h^5f'''''(x_j)}{120} + \cdots\\
f(x_{j+1}) &=& f(x_j) + hf^{\prime}(x_j) + \frac{h^2f''(x_j)}{2}
+ \frac{h^3f'''(x_j)}{6} + \frac{h^4f''''(x_j)}{24}
+ \frac{h^5f'''''(x_j)}{120} + \cdots\\
f(x_{j+2}) &=& f(x_j) + 2hf^{\prime}(x_j) + \frac{4h^2f''(x_j)}{2}
+ \frac{8h^3f'''(x_j)}{6} + \frac{16h^4f''''(x_j)}{24}
+ \frac{32h^5f'''''(x_j)}{120} + \cdots
\end{eqnarray*}
To get the $h^2, h^3$, and $h^4$ terms to cancel out, we can compute
.. math::
f(x_{j-2}) - 8f(x_{j-1}) + 8f(x_{j-1}) - f(x_{j+2})
= 12hf^{\prime}(x_j) - \frac{48h^5f'''''(x_j)}{120}
which can be rearranged to
.. math::
f^{\prime}(x_j) = \frac{f(x_{j-2}) - 8f(x_{j-1})
+ 8f(x_{j-1}) - f(x_{j+2})}{12h} + O(h^4).
This formula is a better approximation for the derivative at $x_j$
than the central difference formula, but requires twice as many
calculations.
Parameters
----------
x : np.ndarray
The input parameters array.
fm : Callable
Forward model.
fm_args: Tuple[Any]
Positional arguments for the forward model.
fm_kwargs : Dict[Any, Any]
Keyword arguments for the forward model.
accuracy : int, optional
Number of points to use for the finite difference approximation.
Possible values are 0 (2 points), 1 (4 points), 2 (6 points),
3 (4 points). The default is 0 which corresponds to the central
difference scheme (2 points).
eps: float, optional
The epsilon for the computation (h). By default, it take 1e-6 times
the maximum absolute value of the input data.
.. :math:
\delta s = \begin{cases}
\epsilon^{1/3} min\Big(max\left(\lvert s\rvert\right),
1\Big) & \text{if} \; max\left(\lvert s\rvert\right) > 0\\
\epsilon^{1/3} & \text{otherwise}
\end{cases}
max_workers: int
Number of workers used. If different from one, the calculation relies on
multi-processing to decrease the computation time. The default is 1.
Returns
-------
NDArrayFloat
The finite difference gradient vector.
"""
return finite_jacobian(
x, fm, fm_args, fm_kwargs, accuracy, eps, max_workers
).reshape(x.shape)