Tutorial: Image Restoration



To install our Python library proxop, which contains the proximity operators of all the functions in this repository, you can run the following command in a terminal:

pip install proxop

Introduction

This tutorial describes how to solve an optimization problem with the forward-backward primal-dual algorithm (FBPD), which aims to $$ \operatorname*{minimize}_{x\in\mathbb{R}^N} \; f(x) + g(x) + h(Lx), $$ where $f\colon\mathbb{R}^N\to\,]-\infty,+\infty]$ and $h\colon\mathbb{R}^M\to\,]-\infty,+\infty]$ are proper, lower-semicontinuous, convex functions whose proximity operators can be computed, $g\colon\mathbb{R}^N\to\mathbb{R}$ is a convex differentiable function with a $\beta$-Lipschitz gradient, and $L\colon\mathbb{R}^N\to\mathbb{R}^M$ is a linear operator.

Under some technical assumptions, it can be shown that the sequence $(x_i)_{i\in\mathbb{N}}$ generated by $$ \left\lfloor \begin{aligned} x_{i+1} &= \operatorname{prox}_{\tau \, f} \Big(x_i - \tau \big(\nabla g(x_i) + L^\top y_i\big)\Big)\\[0.5em] y_{i+1} &= \operatorname{prox}_{\sigma \, h^*} \Big( y_i+\sigma \, L\big(2x_{i+1}-x_i\big)\Big) \end{aligned} \right. $$ converges to a solution to the above problem, provided that the step sizes $\tau>0$ and $\sigma>0$ are chosen so that $$\tau \left(\frac{\beta}{2} + \sigma \|L^\top L\|\right) \le 1.$$
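Note that the implementations below never call $\operatorname{prox}_{\sigma h^*}$ directly: after forming $\hat{y} = y_i + \sigma L(2x_{i+1}-x_i)$, the dual update is evaluated through Moreau's decomposition, $$ \operatorname{prox}_{\sigma h^*}(\hat{y}) = \hat{y} - \sigma \operatorname{prox}_{h/\sigma}\!\big(\hat{y}/\sigma\big), $$ so that only the proximity operator of $h$ is ever required.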

A generic implementation of FBPD is given below (the function inputs will be clarified later).


function [x, it, time, crit] = FBPD(x, f, g, h, opt)

% default inputs
if nargin < 2 || isempty(f), f.fun = @(x) 0; f.prox = @(x,gamma) x;                                                   end
if nargin < 3 || isempty(g), g.fun = @(x) 0; g.grad = @(x)       0; g.beta   = 0;                                     end
if nargin < 4 || isempty(h), h.fun = @(y) 0; h.prox = @(y,gamma) y; h.dir_op = @(x) x; h.adj_op = @(y) y; h.beta = 1; end
if nargin < 5 || isempty(opt), opt.tol = 1e-4; opt.iter = 500;                                                        end

% select the step-sizes
tau = 2 / (g.beta+2);
sigma = (1/tau - g.beta/2) / h.beta;

% initialize the dual solution
y = h.dir_op(x);

% execute the algorithm
time = zeros(1, opt.iter);
crit = zeros(1, opt.iter);
hdl = waitbar(0, 'Running FBPD...');
for it = 1:opt.iter
    
    tic;
    
    % primal forward-backward step
    x_old = x;
    x = x - tau * ( g.grad(x) + h.adj_op(y) );
    x = f.prox(x, tau);
    
    % dual forward-backward step
    y = y + sigma * h.dir_op(2*x - x_old);
    y = y - sigma * h.prox(y/sigma, 1/sigma);   

    % time and criterion
    time(it) = toc;
    crit(it) = f.fun(x) + g.fun(x) + h.fun(h.dir_op(x));
           
    % stopping rule
    if norm( x(:) - x_old(:) ) < opt.tol * norm( x_old(:) ) && it > 10
        break;
    end
    
    waitbar(it/opt.iter, hdl);
end

close(hdl);
crit = crit(1:it);
time = cumsum(time(1:it));

import numpy as np
import time

# Default inputs of the FBPD function: the zero function (whose proximity
# operator is the identity) combined with the identity linear operator
class EmptyFunction:
    beta = 1                                   # default constant used by the step-size rule
    def fun(self, x):        return 0
    def grad(self, x):       return 0
    def prox(self, x, tau):  return x
    def dir_op(self, x):     return x
    def adj_op(self, x):     return x

# Default algorithmic parameters: 'tol' and maximum number of iterations
opt_= {'tol': 1e-4, 'iter': 500}

def FBPD(x_init, f=EmptyFunction(), g=EmptyFunction(), h=EmptyFunction(), opt=opt_):

    # algorithmic parameters
    tol      = opt['tol']
    max_iter = opt['iter']
    
    # step-sizes
    tau   = 2.0 / (g.beta + 2.0)
    sigma = (1.0/tau - g.beta/2.0) / h.beta

    # initialization
    x = x_init
    y = h.dir_op(x)

    print('Running FBPD...')
    
    timing = np.zeros(max_iter)
    criter = np.zeros(max_iter)

    # algorithm loop
    for it in range(max_iter):

        t = time.time()

        # primal forward-backward step
        x_old = x
        x = x - tau * ( g.grad(x) + h.adj_op(y) )
        x = f.prox(x, tau)

        # dual forward-backward step
        y = y + sigma * h.dir_op(2*x - x_old)
        y = y - sigma * h.prox(y/sigma, 1/sigma)

        # time and criterion
        timing[it] = time.time() - t
        criter[it] = f.fun(x) + g.fun(x) + h.fun(h.dir_op(x))

        # stopping rule
        if np.linalg.norm(x - x_old) < tol * np.linalg.norm(x_old) and it > 10:
            break

        print(str(it) + ' out of ' + str(max_iter) + ' iterations')

    criter = criter[0:it+1]
    timing = np.cumsum(timing[0:it+1])
    
    return x, it, timing, criter
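Before moving on to images, here is a minimal sketch of how the f/g/h interface is used (this example is not part of the original tutorial, and the class names are purely illustrative): a 1-D signal is denoised by solving $\operatorname*{minimize}_{x\in[0,1]^N} \frac{1}{2}\|x-z\|_2^2$, with f the box constraint and g the quadratic data term, while h keeps its default value (zero function and identity operator).

# Hypothetical usage example (not part of the tutorial): box-constrained denoising of a 1-D signal
class BoxIndicator:
    def fun(self, x):        return 0.0                    # indicator value inside the box
    def prox(self, x, tau):  return np.clip(x, 0.0, 1.0)   # projection onto [0,1]^N

class Quadratic:
    beta = 1.0                                             # Lipschitz constant of the gradient
    def __init__(self, z):   self.z = z
    def fun(self, x):        return 0.5 * np.sum((x - self.z)**2)
    def grad(self, x):       return x - self.z

z_test = np.linspace(0, 1, 100) + 0.2 * np.random.randn(100)
x_test, it, timing, crit = FBPD(z_test, f=BoxIndicator(), g=Quadratic(z_test))
print(np.allclose(x_test, np.clip(z_test, 0, 1), atol=1e-3))   # the solution is the projection of z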

Image Restoration

The goal of image restoration is to recover the visual content of a corrupted image through the inversion of the corresponding degradation process. In this context, a popular task consists of recovering an image $\overline{x} \in \mathbb{R}^N$ as close as possible to some observations $z \in \mathbb{R}^K$ generated as $$ z = A\overline{x} + b $$ where $A\in\mathbb{R}^{K\times N}$ is a linear operator modeling the acquisition process (here, a blur), and $b\in\mathbb{R}^K$ is a realization of additive noise. For the purpose of this tutorial, the degraded image is generated as follows, with $K=N$.


% original image
x_bar = double( imread('firemen.jpg') );

% blur operator
psf = fspecial('average', 3);

% noisy image
rng('default');
z = imfilter(x_bar, psf) + 20 * randn( size(x_bar) );

% visualization
figure; imshow(x_bar/255,[]); title('Original image');
figure; imshow(    z/255,[]); title('Noisy image');

import scipy.ndimage as fil
import matplotlib.pyplot as plt
%matplotlib inline 

# original image
x_bar = plt.imread('firemen.jpg')
x_bar = x_bar.astype(np.float64)

# blur operator: size of the uniform (averaging) filter applied to each color channel
psf = (3, 3, 1)

# noisy image
z = fil.uniform_filter(x_bar, psf) + 20 * np.random.randn(*x_bar.shape);

# visualization
plt.imshow(x_bar/255)
plt.title('Original image')
plt.figure()
plt.imshow(np.clip(z/255,0,1))
plt.title('Noisy image')


Problem Formulation

To recover $\overline{x}$ from $z$, one can follow a variational approach that aims at $$ \operatorname*{minimize}_{x\in[0,255]^N} \quad \underbrace{\frac{1}{2}\|Ax-z\|_2^2}_{\textrm{Data fidelity}} \quad + \underbrace{h(L x)}_{\textrm{Regularization}} $$ This formulation reverts to the general convex optimization problem given in the introduction by setting $f$ equal to the indicator function of the box $[0,255]^N$ (whose proximity operator is the projection onto that box), $g(x) = \frac{1}{2}\|Ax-z\|_2^2$, and $h\circ L$ equal to the regularization term. As for the data fidelity term, FBPD needs the following information: the operator $A$ and its adjoint $A^\top$, the gradient $\nabla g(x) = A^\top(Ax-z)$, and a Lipschitz constant $\beta \geq \|A^\top A\|$. This is illustrated in the code below.

% constraint
f.prox = @(x,tau) project_box(x, 0, 255);

% data fidelity
A_dir  = @(x) imfilter(x, psf);
A_adj  = @(x) imfilter(x, rot90(psf,2));  % WARNING: 'psf' must be a (2n+1)-by-(2n+1) matrix
g.grad = @(x) A_adj(A_dir(x) - z);
g.beta = sum(abs(psf(:)));

% criteria
f.fun = @(x) indicator_box(x, 0, 255);
g.fun = @(x) 0.5 * sum(sum(sum((A_dir(x)-z).^2)));

from proximityOperator import BoxConstraint

class LeastSquares:
    
    def __init__(self, z=None, psf=None):
        self.z    = z
        self.psf  = psf
        self.beta = 1.0   # the normalized averaging filter satisfies ||A^T A|| <= 1
        
    def A_dir(self, x):
        return fil.uniform_filter(x, self.psf)
    
    def A_adj(self, x):
        return fil.uniform_filter(x, self.psf);     # WARNING: filter dimensions must be odd
    
    def grad(self, x):
        return self.A_adj( self.A_dir(x) - self.z )
    
    def fun(self, x):
        p = self.A_dir(x)
        return 0.5 * np.sum(np.square(p - self.z))

# constraint
f = BoxConstraint(0, 255)

# data fidelity
g = LeastSquares(z, psf)

Regularization: Total Variation (TV)

The total variation (TV) regularization is defined as: $$ \operatorname{TV}(x) = \lambda \sum_{\ell=1}^N \left(\big|x^{(\ell)}-x^{(n_{\ell,1})}\big|^2 + \big|x^{(\ell)}-x^{(n_{\ell,2})}\big|^2\right)^{1/2} $$ where $\lambda>0$, and $(n_{\ell,1},n_{\ell,2})\in\{1,\dots,N\}^2$ denote the positions of the horizontal/vertical nearest neighbors of $x^{(\ell)}$.

This penalty can be plugged into the problem formulation by setting:

$$ y = L x = \left[ \begin{aligned} &x^{(1)}-x^{(n_{1,1})}\\ &x^{(1)}-x^{(n_{1,2})}\\ &\quad\;\; \vdots\\ &x^{(N)}-x^{(n_{N,1})}\\ &x^{(N)}-x^{(n_{N,2})}\\ \end{aligned} \right] \begin{aligned} &\left.\vphantom{ \begin{aligned} &x^{(1)}-x^{(n_{1,1})}\\ &x^{(1)}-x^{(n_{1,2})}\\ \end{aligned} }\right\} \; y_1\in\mathbb{R}^2\\ &\qquad\vdots\\ &\left.\vphantom{ \begin{aligned} &x^{(N)}-x^{(n_{N,1})}\\ &x^{(N)}-x^{(n_{N,2})}\\ \end{aligned} }\right\} \; y_N\in\mathbb{R}^2\\ \end{aligned} $$ and $$ h(y) = \sum_{\ell=1}^N \lambda \|y_\ell\|_2. $$ Consequently, FBPD needs the following information:
- $L$ and $L^\top$
- $\|L^\top L\| = 8$
- $\operatorname{prox}_{\gamma \, h}(y) = \Big(\operatorname{prox}_{\gamma \, \lambda\|\cdot\|_2}(y_\ell)\Big)_{1\le\ell\le N}$

For the implementation of $\operatorname{prox}_h$, note that h.dir_op generates a 4D matrix in which the gradient vector of each pixel is stored along the 4th dimension.
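Each per-pixel proximity operator above admits the standard closed form (block soft-thresholding), recalled here for completeness: $$ \operatorname{prox}_{\gamma\lambda\|\cdot\|_2}(y_\ell) = \left(1 - \frac{\gamma\lambda}{\max\{\|y_\ell\|_2,\,\gamma\lambda\}}\right) y_\ell. $$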


% forward finite differences (with Neumann boundary conditions)
hor_forw = @(x) [x(:,2:end,:)-x(:,1:end-1,:), zeros(size(x,1),1,size(x,3))]; % horizontal
ver_forw = @(x) [x(2:end,:,:)-x(1:end-1,:,:); zeros(1,size(x,2),size(x,3))]; % vertical

% backward finite differences (with Neumann boundary conditions)
hor_back = @(x) [-x(:,1,:), x(:,1:end-2,:)-x(:,2:end-1,:), x(:,end-1,:)];    % horizontal
ver_back = @(x) [-x(1,:,:); x(1:end-2,:,:)-x(2:end-1,:,:); x(end-1,:,:)];    % vertical

% direct and adjoint operators
h.dir_op = @(x) cat( 4, hor_forw(x), ver_forw(x) );
h.adj_op = @(y) hor_back( y(:,:,:,1) ) + ver_back( y(:,:,:,2) );

% operator norm
h.beta = 8;

% regularization parameter
lambda = 5;

% proximity operator
h.prox = @(y,gamma) prox_L2(y, gamma*lambda, 4);

% criterion
h.fun = @(y) fun_L2(y, lambda, 4);

from proximityOperator import L2Norm

def hor_forward(x):
    """ Horizontal forward finite differences (with Neumann boundary conditions) """
    hor = np.zeros_like(x)
    hor[:,:-1,:] = x[:,1:,:] - x[:,:-1,:]
    return hor

def ver_forward(x):
    """ Vertical forward finite differences (with Neumann boundary conditions) """
    ver = np.zeros_like(x)
    ver[:-1,:,:] = x[1:,:,:] - x[:-1,:,:]
    return ver

def hor_backward(x):
    """ Horizontal backward finite differences (with Neumann boundary conditions) """
    Nr, Nc, Nb = x.shape
    zer = np.zeros((Nr,1,Nb))
    xxx = x[:,:-1,:]
    return np.concatenate((zer,xxx), 1) - np.concatenate((xxx,zer), 1)

def ver_backward(x):
    """ Vertical backward finite differences (with Neumann boundary conditions) """
    Nr, Nc, Nb = x.shape
    zer = np.zeros((1,Nc,Nb))
    xxx = x[:-1,:,:]
    return np.concatenate((zer,xxx), 0) - np.concatenate((xxx,zer), 0)


class TotalVariation(L2Norm):

    beta = 8
    
    def __init__(self, gamma):
        L2Norm.__init__(self, gamma, direction=3)
    
    def dir_op(self, x):
        h = hor_forward(x)
        v = ver_forward(x)
        return np.stack((h,v), 3)
    
    def adj_op(self, y):
        h = hor_backward( y[:,:,:,0] )
        v = ver_backward( y[:,:,:,1] )
        return h + v
    
    # def fun(self, x)  --> inherited from L2Norm
    # def prox(self, x) --> inherited from L2Norm

# hyperparameter
gamma = 5

# regularization
h = TotalVariation(gamma)    
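As an optional sanity check (not part of the original tutorial), one can verify numerically that adj_op is indeed the adjoint of dir_op and estimate the operator norm by power iteration; the estimate should stay below the bound of 8 used above.

# Optional sanity check: adjoint consistency and operator-norm bound of the TV operator
x_test = np.random.randn(16, 16, 3)
y_test = np.random.randn(16, 16, 3, 2)
print( np.sum(h.dir_op(x_test) * y_test) - np.sum(x_test * h.adj_op(y_test)) )  # ~ 0

u = np.random.randn(16, 16, 3)
for _ in range(100):
    u = h.adj_op(h.dir_op(u))     # apply L^T L
    norm_est = np.linalg.norm(u)  # current estimate of ||L^T L||
    u = u / norm_est
print(norm_est)                   # close to, and never above, 8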

Execution

The information needed by FBPD is now complete, and the optimization method can be executed.

% minimization
[x, it, time, crit] = FBPD(z, f, g, h);

% PSNR
psnr = 10 * log10( 255^2 / mean((x(:)-x_bar(:)).^2) );

% visualization
figure; imshow(x/255,[]); title(['Restored image - PSNR: ' num2str(round(psnr,2))])
figure; plot(time, crit); title('Convergence plot'); xlabel('seconds'); ylabel('criterion')

# minimization
x, it, time_, crit = FBPD(z, f, g, h)

# PSNR
psnr = 10 * np.log10( 255*255 / np.mean(np.square(x-x_bar)) )

# visualization
plt.imshow(x/255); 
plt.title( 'Restored image - PSNR: ' + str(np.round(psnr,2)) )
plt.figure()
plt.plot(time_, crit) 
plt.title('Convergence plot')
plt.xlabel('seconds')
plt.ylabel('criterion')


Regularization: Structure Tensor (ST)

In the previous example, the regularization is applied separately to each color channel of the image to be restored. The *structure tensor (ST)* is a natural extension of TV that allows one to process the channels jointly. Assume that the sought image $x$ is composed of three channels (red, green, and blue): $$ x = \begin{bmatrix} x_1\\ x_2\\ x_3 \end{bmatrix} $$ with $x_j\in\mathbb{R}^Q$ for every $j\in\{1,2,3\}$, and $N=3Q$. The ST regularization is defined as: $$ \operatorname{ST}(x) = \lambda \sum_{\ell=1}^Q \left\|\begin{bmatrix} x_1^{(\ell)}-x_1^{(n_{\ell,1})} & x_2^{(\ell)}-x_2^{(n_{\ell,1})} & x_3^{(\ell)}-x_3^{(n_{\ell,1})} \\ x_1^{(\ell)}-x_1^{(n_{\ell,2})} & x_2^{(\ell)}-x_2^{(n_{\ell,2})} & x_3^{(\ell)}-x_3^{(n_{\ell,2})} \end{bmatrix}\right\|_* $$ where $\lambda>0$ and $\|\cdot\|_*$ denotes the nuclear norm. This penalty can be plugged into the problem formulation by setting: $$ Y = F x = \left[ \begin{aligned} &x_1^{(1)}-x_1^{(n_{1,1})} &&x_2^{(1)}-x_2^{(n_{1,1})} &&x_3^{(1)}-x_3^{(n_{1,1})} \\ &x_1^{(1)}-x_1^{(n_{1,2})} &&x_2^{(1)}-x_2^{(n_{1,2})} &&x_3^{(1)}-x_3^{(n_{1,2})} \\ &\quad\;\; \vdots &&\quad\;\; \vdots&&\quad\;\; \vdots\\ &x_1^{(Q)}-x_1^{(n_{Q,1})} &&x_2^{(Q)}-x_2^{(n_{Q,1})} &&x_3^{(Q)}-x_3^{(n_{Q,1})} \\ &x_1^{(Q)}-x_1^{(n_{Q,2})} &&x_2^{(Q)}-x_2^{(n_{Q,2})} &&x_3^{(Q)}-x_3^{(n_{Q,2})} \\ \end{aligned} \right] \begin{aligned} &\left.\vphantom{ \begin{aligned} &x^{(1)}-x^{(n_{1,1})}\\ &x^{(1)}-x^{(n_{1,2})}\\ \end{aligned} }\right\} \; Y_1\in\mathbb{R}^{2\times3}\\ &\qquad\vdots\\ &\left.\vphantom{ \begin{aligned} &x^{(Q)}-x^{(n_{Q,1})}\\ &x^{(Q)}-x^{(n_{Q,2})}\\ \end{aligned} }\right\} \; Y_Q\in\mathbb{R}^{2\times3}\\ \end{aligned} \\ \\ \textrm{and}\qquad h(Y) = \sum_{\ell=1}^Q \lambda \|Y_\ell\|_*. $$ Consequently, FBPD needs the following information:
- $F$ and $F^\top$
- $\|F^\top F\| = 8$
- $\operatorname{prox}_{\gamma \, h}(Y) = \Big(\operatorname{prox}_{\gamma \, \lambda\|\cdot\|_*}(Y_\ell)\Big)_{1\le\ell\le Q}$
Let's start with the implementation of $F$ and its adjoint $F^\top$.


from proxop import NuclearNorm 

class StructureTensor(NuclearNorm):
    def __init__(self, lam):
        self.beta = 8
        self.lam = lam  # hyperparameter lambda
        super().__init__()
    
    def prox(self, x, gamma):
        return super().prox(x, self.lam*gamma)
    
    def __call__(self, x):
        return self.lam * super().__call__(x)

    fun = __call__   # FBPD evaluates the criterion through .fun
    
    # utility functions
    def _pack(self,x):
        sz=np.shape(x)
        return np.reshape( np.transpose(x, (2,0,1)), (1, sz[2], sz[0], sz[1]) )

    def _unpack(self, y, i):
        return np.transpose( y[i,...], (1,2,0) )
    
    def dir_op(self, x):
        h = self._pack(hor_forward(x))
        v = self._pack(ver_forward(x))
        return np.concatenate((h,v), 0)
    
    def adj_op(self, y):
        h = hor_backward( self._unpack( y, 0) )
        v = ver_backward( self._unpack( y, 1) )
        return h + v

Let's continue with the implementation of $\operatorname{prox}_h$, which the class above delegates to NuclearNorm (with the scaling by $\lambda$ handled in the overridden prox method). Note that dir_op generates a 4D matrix in which the $2\times 3$ gradient matrix of each position is stored on the 1st and 2nd dimensions. It only remains to choose the regularization parameter and instantiate the class.
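For reference, the per-position proximity operator of the nuclear norm admits a closed form through singular-value soft-thresholding; the sketch below is purely illustrative and independent of proxop.

# Illustrative sketch (not used by the tutorial): prox of gamma*||.||_* for a single 2x3 matrix Y
def prox_nuclear_single(Y, gamma):
    U, S, Vt = np.linalg.svd(Y, full_matrices=False)
    return U @ np.diag(np.maximum(S - gamma, 0.0)) @ Vt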

# regularization parameter
lambda_ = 7
h=StructureTensor(lam=lambda_)


The information needed by FBPD is now complete, and the optimization method can be executed.

# minimization
opt_= {'tol': 1e-4, 'iter': 100}
x, it, time_, crit = FBPD(z, f, g, h, opt_)

# PSNR
psnr = 10 * np.log10( 255**2 / np.mean((x-x_bar)**2))

# visualization
plt.imshow(x/255);
plt.title( 'Restored image - PSNR: ' + str(np.round(psnr,2)) )
plt.figure()
plt.plot(time_, crit) 
plt.title('Convergence plot')
plt.xlabel('seconds')
plt.ylabel('criterion')
plt.savefig("convergence_ST.png")


Total Variation (TV) vs Structure Tensor (ST)