/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2013, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/
/*
FILE lin_solver.c
HEADER lin_solver.h

TO_HEADER:


// useful text macros
#include "feins_macros.h"
// data structures 
#include "datastruc.h"
#include "sparse_struct.h"

*/




/* prototypes of external functions */
#include <math.h>
#include "sparse.h"
#include "feins_lapack.h"

/* function prototypes */
#include "lin_solver.h"


/*FUNCTION*/
int PCG( int it_max, int stoptype, double atol, double rtol, int use_initial_x,
	 struct vector *x, double *residual, int *iter,
	 int (*multfunc)(void *Mat, struct vector *vec,
			 struct vector *out), 
	 int (*precondi)(void *Mat, struct vector *vec, void *prec_dat,
			 struct vector *out),
	 void *Mat, struct vector *rhs, void *prec_dat
/* Preconditioned Conjugate Gradient solver for systems of linear
   equations with symmetric positive definite system matrices, 
   approximates a solution of 

          Mat*x = rhs

   Input:  it_max   - maximal number of PCG iterations
	   stoptype - type of the stopping criterion:
                      0 - no criterion, just do it_max iterations,
		      1 - absolute, stop when norm(residual)<atol,
		      2 - relative to the initial residual, stop when 
		          norm(residual) < rtol*norm(residual_0)
		      3 - absolute and relative, stop if either the
		          relative or absolute stopping criterion of
		          above is fulfilled
		      other result in FAIL return + error message
           atol     - value of the absolute tolerance for the stopping
                      criteria 1 and 3, see stoptype above
           rtol     - value of the relative tolerance for the stopping
                      criteria 2 and 3, see stoptype above
           use_initial_x
                    - states if the provided x shall be used as
                      initial iterate x_0 or not,
		      = 1 : use it,
		      = 0 : don't use it, in this case the zero vector
		            is used instead
	   multfunc - pointer to a function performing the matrix
                      vector multiplication Mat*vec, syntax:
		      int multfunc(void *Mat, struct vector *vec, 
		                   struct vector *out),
		      where Mat is the pointer provided in the call to
		      PCG and vec is a vector of the same type as the
		      solution vector x, 
		      must return SUCCESS on success and FAIL in case
		      of error 
	   precondi - pointer to a function performing the
                      preconditioning operation out=C^{-1}*vec,
		      ideally C should be spectral equivalent to the
		      system matrix Mat, e.g. C^{-1} can be a
		      cheap approximation of the inverse of Mat,
		      the quality of the preconditioner has huge
		      influence on the performance of PCG,
		      syntax:
		      int precondi(void *Mat, struct vector *vec,
		                   void *prec_dat, struct vector *out),
		      where Mat as provided to PCG, vec a vector of
		      same type as solution vector x, prec_dat as
		      provided to PCG, must return SUCCESS on success
		      and FAIL in case of error,
		      if precondi==NULL then the function is not
		      called but C^{-1}=I is used (no preconditioning)
           Mat      - pointer to data necessary to perform the
                      multiplication with the matrix, 
		      in general this should be some representation of
		      the matrix itself, but it is not used in PCG
		      directly, but only through multfunc and precondi
	   rhs      - the right-hand side vector of the equation system
	   prec_dat - pointer to additional data for the
	              preconditioner, not used in PCG itself, but only
	              in precondi

   In/Out: x        - on Input:  initial iterate, if use_initial_x==1,
                                 else not used as input,
                      on Output: approximate solution of the system

   Output: residual - norm of the residual of the final iterate,
                      provided by reference, not used if NULL pointer
                      is provided
           iter     - number of the last iteration,
                      provided by reference, not used if NULL pointer
                      is provided

   Return: SUCCESS  - stopping criterion is fulfilled
           FAIL     - general failure, see error messages
	   10       - itermax reached, but stopping criterion not fulfilled
*/
	 ){
  struct vector res, preco, search;
  double alpha, beta, gamma, gamma_new, delta, resnorm, iniresnorm;
  int it, check;
  FIDX i, len;
  int err;

  len=(*x).len;

  /* malloc of datastructures */
  err=vector_alloc( &res, len );
  if (err!=SUCCESS)
    {
      fprintf(stderr,
	      "failure during init of PCG, vector_alloc for res\n");
      return FAIL;
    }
  err=vector_alloc( &preco, len );
  if (err!=SUCCESS)
    {
      fprintf(stderr,
	      "failure during init of PCG, vector_alloc for preco\n");
      vector_free(&res);
      return FAIL;
    }
  err=vector_alloc( &search, len );
  if (err!=SUCCESS)
    {
      fprintf(stderr,
	      "failure during init of PCG, vector_alloc for search\n");
      vector_free(&res);
      vector_free(&preco);
      return FAIL;
    }
  /* save some typing */
#define MULTFUNCERRHANDLE {if (err!=SUCCESS) \
	{ fprintf(stderr, "failure of multfunc in PCG\n"); \
	  vector_free(&res); \
	  vector_free(&preco); \
	  vector_free(&search); \
	  return FAIL; \
	} \
      }
#define PRECONDIERRHANDLE {if (err!=SUCCESS) \
	{ fprintf(stderr, "failure of precondi in PCG\n"); \
	  vector_free(&res); \
	  vector_free(&preco); \
	  vector_free(&search); \
	  return FAIL; \
	} \
      }
	   
  /* initialisation of iteration */
     /* iteration counter */
  it=0; 
    /* residual */
  if (use_initial_x==1)
    {
      /* res = Mat*x */
      err= (*multfunc)( Mat, x, &res);
      MULTFUNCERRHANDLE;
      /* res = Mat*x-rhs */
      for (i=0; i<len; i++)
	res.V[i]-=(*rhs).V[i];
    }
  else
    {
      /* x=0, res=-rhs */
      for (i=0; i<len; i++)
	{
	  res.V[i]=-(*rhs).V[i];
	  (*x).V[i]=0;
	}
    }
    /* preconditioned residual, search, gamma */
  if (precondi!=NULL)
    {
      /* preco=C^{-1} * res */
      err= (*precondi)(Mat, &res, prec_dat, &preco);
      PRECONDIERRHANDLE;
      gamma=0.0;
      resnorm=0.0;
      for (i=0; i<len; i++)
	{
	  /* search  = preco */
	  search.V[i]= preco.V[i];
	  /* gamma   = preco^T res */
	  gamma     += preco.V[i] * res.V[i];
	  /* resnorm = res^T res */
	  resnorm   += preco.V[i]   * preco.V[i];
	}
      resnorm=sqrt(resnorm);
    }
  else
    {
      gamma=0.0;
      for (i=0; i<len; i++)
	{
	  /* search=res */
	  search.V[i]=res.V[i];
	  /* gamma = res^T res */
	  gamma += res.V[i] * res.V[i];
	}
      resnorm=sqrt(gamma);
    }

  iniresnorm=resnorm;

  /* two times identical code, so re define it as macro ;o) */
#define LOOPCHECK { \
  /*printf("PCG:  it=%3d   |res|=%7.2e \n",it, resnorm); /* */	\
  if (residual != NULL) \
    { *residual = resnorm; } \
  if (iter     != NULL) \
    { *iter     = it; } \
  if (stoptype==0) \
    { check=1; } \
  else if (stoptype==1) \
    { check=(resnorm>atol); } \
  else if (stoptype==2) \
    { check=(resnorm>rtol*iniresnorm); } \
  else if (stoptype==3) \
    { check=((resnorm>atol)&&(resnorm>rtol*iniresnorm)); }	\
  else \
    { fprintf(stderr, "unknown stoptype=%d in PCG !\n", stoptype); \
      vector_free(&res); \
      vector_free(&preco); \
      vector_free(&search); \
      return FAIL; \
    } \
  }
  
  
  LOOPCHECK;

  /* iteration loop */
  while ((check)&&(it<it_max))
    {
      it++;
      
      /* preco=help= Mat* search */
      err= (*multfunc)( Mat, &search, &preco );
      MULTFUNCERRHANDLE;

      /* delta = help^T search */
      delta = 0.0;
      for (i=0; i<len; i++)
	{
	  delta += preco.V[i] * search.V[i];
	}

      alpha = -gamma/delta;
      for (i=0; i<len; i++)
	{
	  /* x       = x   + alpha * search */
	  (*x).V[i] += alpha * search.V[i];
	  
	  /* res     = res + alpha * help   */
	  res.V[i]  += alpha * preco.V[i];
	}


      /* get new search */
      /* preconditioner */
      if (precondi!=NULL)
	{
	  /* preco=C^{-1} * res */
	  err= (*precondi)(Mat, &res, prec_dat, &preco);
	  PRECONDIERRHANDLE;
	  
	  /* get residual norm and gamma_new */
	  resnorm=0.0;
	  gamma_new=0.0;
	  for (i=0; i<len; i++)
	    {
	      /* gamma_new = preco^T res */
	      gamma_new += preco.V[i] * res.V[i];
	      /* resnorm = preco^T preco */
	      resnorm   += preco.V[i] * preco.V[i];
	    }
	  resnorm=sqrt(resnorm);
	      
	  beta = gamma_new / gamma;
	  
	  /* new search direction 
	     search = precon + beta * search */
	  for (i=0; i<len; i++)
	    {
	      search.V[i] = preco.V[i] + beta * search.V[i];
	    }
	}
      else
	{
	  
	  /* get residual norm and gamma_new */
	  resnorm=0.0;
	  for (i=0; i<len; i++)
	    {
	      /* resnorm = res^T res */
	      resnorm   += res.V[i] * res.V[i];
	    }
	  gamma_new = resnorm;
	  resnorm=sqrt(resnorm);
	  
	  beta = gamma_new / gamma;

	  /* new search direction 
	     search = res + beta * search */
	  for (i=0; i<len; i++)
	    {
	      search.V[i] = res.V[i] + beta * search.V[i];
	    }
	}
      gamma = gamma_new;
      
      LOOPCHECK;
    }

  if (check)
    {
      /* goal not met yet => failure */
      check=10;
    }
  else
    {
      /* goal me => SUCCESS */
      check=SUCCESS;
    }

  /* freeing of datastructures */
  vector_free(&res);
  vector_free(&preco);
  vector_free(&search);

#undef LOOPCHECK  
#undef MULTFUNCERRHANDLE
#undef PRECONDIERRHANDLE

  return check;
}




/*FUNCTION*/
int PCR( int it_max, int stoptype, double atol, double rtol, int use_initial_x,
	 struct vector *x, double *residual, int *iter,
	 int (*multfunc)(void *Mat, struct vector *vec,
			 struct vector *out), 
	 int (*precondi)(void *Mat, struct vector *vec, void *prec_dat,
			 struct vector *out),
	 void *Mat, struct vector *rhs, void *prec_dat
/* Preconditioned Conjugate Residual solver for systems of linear
   equations with symmetric (maybe indefinite) system matrices, 
   approximates a solution of 

          Mat*x = rhs

   Input:  it_max   - maximal number of PCR iterations
	   stoptype - type of the stopping criterion:
                      0 - no criterion, just do it_max iterations,
		      1 - absolute, stop when norm(residual)<atol,
		      2 - relative to the initial residual, stop when 
		          norm(residual) < rtol*norm(residual_0)
		      3 - absolute and relative, stop if either the
		          relative or absolute stopping criterion of
		          above is fulfilled
		      other result in FAIL return + error message
           atol     - value of the absolute tolerance for the stopping
                      criteria 1 and 3, see stoptype above
           rtol     - value of the relative tolerance for the stopping
                      criteria 2 and 3, see stoptype above
           use_initial_x
                    - states if the provided x shall be used as
                      initial iterate x_0 or not,
		      = 1 : use it,
		      = 0 : don't use it, in this case the zero vector
		            is used instead
	   multfunc - pointer to a function performing the matrix
                      vector multiplication Mat*vec, syntax:
		      int multfunc(void *Mat, struct vector *vec, 
		                   struct vector *out),
		      where Mat is the pointer provided in the call to
		      PCR and vec is a vector of the same type as the
		      solution vector x, 
		      must return SUCCESS on success and FAIL in case
		      of error 
	   precondi - pointer to a function performing the
                      preconditioning operation out=C^{-1}*vec,
		      ideally C should be spectral equivalent to the
		      system matrix Mat, e.g. C^{-1} can be a
		      cheap approximation of the inverse of Mat,
		      the quality of the preconditioner has huge
		      influence on the performance of PCR,
		      syntax:
		      int precondi(void *Mat, struct vector *vec,
		                   void *prec_dat, struct vector *out),
		      where Mat as provided to PCR, vec a vector of
		      same type as solution vector x, prec_dat as
		      provided to PCR, must return SUCCESS on success
		      and FAIL in case of error,
		      if precondi==NULL then the function is not
		      called but C^{-1}=I is used (no preconditioning)
           Mat      - pointer to data necessary to perform the
                      multiplication with the matrix, 
		      in general this should be some representation of
		      the matrix itself, but it is not used in PCR
		      directly, but only through multfunc and precondi
	   rhs      - the right-hand side vector of the equation system
	   prec_dat - pointer to additional data for the
	              preconditioner, not used in PCR itself, but only
	              in precondi

   In/Out: x        - on Input:  initial iterate, if use_initial_x==1,
                                 else not used as input,
                      on Output: approximate solution of the system

   Output: residual - norm of the residual of the final iterate,
                      provided by reference, not used if NULL pointer
                      is provided
           iter     - number of the last iteration,
                      provided by reference, not used if NULL pointer
                      is provided

   Return: SUCCESS  - stopping criterion is fulfilled
           FAIL     - general failure, see error messages
	   10       - itermax reached, but stopping criterion not fulfilled
*/
	 ){
  struct vector res, p, search, w, y, z;
  double alpha, beta, rho, resnorm, iniresnorm;
  int it, check;
  FIDX i, len;
  int err;

  len=(*x).len;

  /* make sure a vector_free does not result in segfaults */
  res.V=NULL;
  p.V=NULL;
  search.V=NULL;
  w.V=NULL;
  y.V=NULL;
  z.V=NULL;


  /* useful macros */
  /* free data and return failure */
#define PCRERRRET {\
      vector_free(&res);\
      vector_free(&p);\
      vector_free(&search);\
      vector_free(&w);\
      vector_free(&y);\
      vector_free(&z);\
      return FAIL; }

  /* allocate a vector */
#define PCRVECALLOC(__who__) {\
  err=vector_alloc( &__who__, len ); \
  if (err!=SUCCESS) \
    { \
      fprintf(stderr, \
	      "failure during init of PCR, vector_alloc for %s\n", \
              #__who__);\
      PCRERRRET;\
    }\
  }
  

  /* error handler for user provided functions */
#define MULTFUNCERRHANDLE {if (err!=SUCCESS) \
	{ fprintf(stderr, "failure of multfunc in PCR\n"); \
          PCRERRRET;\
	} \
      }
#define PRECONDIERRHANDLE {if (err!=SUCCESS) \
	{ fprintf(stderr, "failure of precondi in PCR\n"); \
          PCRERRRET;\
	} \
      }

  /* check for convergence */
#define LOOPCHECK { \
  /* printf("PCR: it=%3d  |res|=%10.3e\n", it, resnorm); /* */	\
  if (residual != NULL) \
    { *residual = resnorm; } \
  if (iter     != NULL) \
    { *iter     = it; } \
  if (stoptype==0) \
    { check=1; } \
  else if (stoptype==1) \
    { check=(resnorm>atol); } \
  else if (stoptype==2) \
    { check=(resnorm>rtol*iniresnorm); } \
  else if (stoptype==3) \
    { check=((resnorm>atol)&&(resnorm>rtol*iniresnorm)); }	\
  else \
    { fprintf(stderr, "unknown stoptype=%d in PCR !\n", stoptype); \
      PCRERRRET; \
    } \
  }


  /* malloc of datastructures */
  PCRVECALLOC(res);
  PCRVECALLOC(p);
  PCRVECALLOC(search);
  PCRVECALLOC(w);
  PCRVECALLOC(y);
  PCRVECALLOC(z);

	   
  /* initialisation of iteration */
  /* iteration counter */
  it=0; 
  /* residual */
  if (use_initial_x==1)
    {
      /* res = Mat*x */
      err= (*multfunc)( Mat, x, &res);
      MULTFUNCERRHANDLE;
      /* res = rhs - Mat*x */
      for (i=0; i<len; i++)
	res.V[i]=(*rhs).V[i]-res.V[i];
    }
  else
    {
      /* x = 0, res = rhs */
      for (i=0; i<len; i++)
	{
	  res.V[i]=(*rhs).V[i];
	  (*x).V[i]=0;
	}
    }

  /* preconditioned residual */
  if (precondi!=NULL)
    {
      /* p = C^{-1} * res */
      err= (*precondi)(Mat, &res, prec_dat, &p);
      PRECONDIERRHANDLE;
    }
  else
    {
      for (i=0; i<len; i++)
	{
	  /* p = res */
	  p.V[i]=res.V[i];
	}
    }

  /* w= Mat*p */
  err= (*multfunc)( Mat, &p, &w);
  MULTFUNCERRHANDLE;

  /* y=preconditioned w */
  if (precondi!=NULL)
    {
      /* y=C^{-1} * w */
      err= (*precondi)(Mat, &w, prec_dat, &y);
      PRECONDIERRHANDLE;
    }
  else 
    {
      for (i=0; i<len; i++)
	{
	  /* y = w */
	  y.V[i]=w.V[i];
	}
    }


  /* init search, compute resnorm */
  resnorm=0.0;
  for (i=0; i<len; i++)
    {
      /* search   = p */
      search.V[i] = p.V[i];
      /* resnorm  = res^T*res */
      resnorm    += p.V[i]*p.V[i];
    }
  
  resnorm=sqrt(resnorm);

  iniresnorm=resnorm;

  LOOPCHECK;

  /* iteration loop */
  while ((check)&&(it<it_max))
    {
      it++;

      /* rho=y^T*w and alpha=search^T*w/rho */
      rho=0.0;
      alpha=0.0;
      for (i=0; i<len; i++)
	{
	  rho   +=      y.V[i] * w.V[i];
	  alpha += search.V[i] * w.V[i];
	}
      alpha = alpha / rho;

      /* x      = x      + alpha * p 
	 res    = res    - alpha * w
	 search = search - alpha * y 
      */
      for (i=0; i<len; i++)
	{
	  (*x).V[i]   += alpha * p.V[i];
	  res.V[i]    -= alpha * w.V[i];
	  search.V[i] -= alpha * y.V[i];
	}

      /* get new preconditioned residual norm */
      resnorm=0.0;
      for (i=0; i<len; i++)
	{
	  /* resnorm = p^T p */
	  resnorm   += p.V[i]   * p.V[i];
	}
      resnorm = sqrt(resnorm);
      /* printf("PCR:  it=%3d   |res|=%7.2e \n",it, resnorm); /* */

      /* check if new step is required */
      LOOPCHECK;
      if ((check)&&(it<it_max))
	{
	  /* yes => update other vectors */
	  /* z= Mat* search */
	  err= (*multfunc)( Mat, &search, &z );
	  MULTFUNCERRHANDLE;

	  /* beta = -(z^T*y)/rho */
	  beta=0.0;
	  for (i=0; i<len; i++)
	    {
	      beta += z.V[i] * y.V[i];
	    }
	  beta = -beta / rho;

	  /* p = search + beta * p 
	     w = z      + beta * w
	  */
	  for (i=0; i<len; i++)
	    {
	      p.V[i]  = search.V[i] + beta * p.V[i];
	      w.V[i]  = z.V[i]      + beta * w.V[i];
	    }

	  /* preconditioner */
	  if (precondi!=NULL)
	    {
	      /* y=C^{-1} * w */
	      err= (*precondi)(Mat, &w, prec_dat, &y);
	      PRECONDIERRHANDLE;
	    }
	  else
	    {
	      /* y = w */
	      for (i=0; i<len; i++)
		{
		  y.V[i] = w.V[i];
		}
	    }
	}
      /* no => loop ends anyway */
    }

  if (check)
    {
      /* goal not met yet => failure */
      check=10;
    }
  else
    {
      /* goal me => SUCCESS */
      check=SUCCESS;
    }


  /* freeing of datastructures */
  vector_free(&res);
  vector_free(&p);
  vector_free(&search);
  vector_free(&w);
  vector_free(&y);
  vector_free(&z);
  

#undef PCRERRRET
#undef PCRVECALLOC
#undef MULTFUNCERRHANDLE
#undef PRECONDIERRHANDLE
#undef LOOPCHECK  

  return check;
}




/*FUNCTION*/
int GMRES( int m, int re_max, 
	   int stoptype, double atol, double rtol, 
	   int use_initial_x, struct vector *x, 
	   double *residual, int *iter,
	   int (*multfunc)(void *Mat, struct vector *vec,
			   struct vector *out), 
	   int (*precondi)(void *Mat, struct vector *vec,
			   void *prec_dat, struct vector *out),
	   void *Mat, struct vector *rhs, void *prec_dat
/* Generalised Minimal Residual solver for systems of linear
   equations, see Y. Saad, "Iterative Methods for Sparse Linear
   Systems", (2nd edition, Jan 3rd 2000, pp. 157)
   approximates a solution of 

          Mat*x = rhs

   Input:  m        - number of steps before restart, also number of
                      search directions to be stored,
                      generally, better convergence has to be expected
                      for larger m,
		      however, memory requirements are proportional to
		      m (m vectors have to be stored), and up to m
		      inner products (dot products) have to be
		      performed in each iteration,
           re_max   - maximal number restarts, therefore the maximal
                      number of iterations is m*re_max
	   stoptype - type of the stopping criterion:
                      0 - no criterion, just do re_max restarts,
		      1 - absolute, stop when norm(residual)<atol,
		      2 - relative to the initial residual, stop when 
		          norm(residual) < rtol*norm(residual_0)
		      3 - absolute and relative, stop if either the
		          relative or absolute stopping criterion of
		          above is fulfilled
                      3 - do re_max iterations (!not restarts!), but
                          stop if norm(residual)<=atol
		      other result in FAIL return + error message
           atol     - value of the absolute tolerance for the stopping
                      criteria 1 and 3, see stoptype above
           rtol     - value of the relative tolerance for the stopping
                      criteria 2 and 3, see stoptype above
           use_initial_x
                    - states if the provided x shall be used as
                      initial iterate x_0 or not,
		      = 1 : use it,
		      = 0 : don't use it, in this case the zero vector
		            is used instead
           multfunc - pointer to a function performing the matrix
                      vector multiplication Mat*vec, syntax:
		      int multfunc(void *Mat, struct vector *vec, 
		                   struct vector *out),
		      where Mat is the pointer provided in the call to
		      this routine and vec is a vector of the same
		      type as the solution vector x, 
		      must return SUCCESS on success and FAIL in case
		      of error 
	   precondi - pointer to a function performing the
                      preconditioning operation out=C^{-1}*vec,
		      ideally C should be spectral equivalent to the
		      system matrix Mat, e.g. C^{-1} can be a
		      cheap approximation of the inverse of Mat,
		      the quality of the preconditioner has huge
		      influence on the performance of this solver,
		      syntax:
		      int precondi(void *Mat, struct vector *vec,
		                   void *prec_dat, struct vector *out),
		      where Mat as provided to this routine, vec a
		      vector of same type as solution vector x,
		      prec_dat as provided to this routine, must
		      return SUCCESS on success and FAIL in case of
		      error, 
		      if precondi==NULL then the function is not
		      called but C^{-1}=I is used (no preconditioning)
           Mat      - pointer to data necessary to perform the
                      multiplication with the matrix, 
		      in general this should be some representation of
		      the matrix itself, but it is not used in this
		      routine directly, but only through multfunc and
		      precondi 
	   rhs      - the right-hand side vector of the equation system
	   prec_dat - pointer to additional data for the
	              preconditioner, not used in this routine itself,
	              but only in precondi

   In/Out: x        - on Input:  initial iterate, if use_initial_x==1,
                                 else not used as input,
                      on Output: approximate solution of the system

   Output: residual - norm of the residual of the final iterate,
                      provided by reference, not used if NULL pointer
                      is provided
           iter     - number of the last iteration,
                      provided by reference, not used if NULL pointer
                      is provided

   Return: SUCCESS  - stopping criterion is fulfilled
           FAIL     - general failure, see error messages
	   10       - itermax reached, but stopping criterion not
	              fulfilled 
*/
	   ){
  struct vector res, v_i, w; /* residual, and vectors for the
				proconditioner */
  double *v_all;             /* stored search directions (orthogonal) */
  double *H;                 /* the matrix which states the least
				square system in the Krylov subspace,
				is stored in upper triangular form
				(after applying the rotations
				Omega_i), compact storage, the i-th
				(i starting form 1) collumn starts at
				the ( (i-1)i/2 ) entry of the array */
  double *g;                 /* rhs of the least square sustem in the
				subspace */
  double *y;                 /* solution of the least square sustem in
				the subspace */
  double *Ohm;               /* the rotations which transform the
				Hermite matrix of the least square
				sustem in the subspace into  upper
				triangular form, c_i=Ohm[i*2+0],
				s_i=Ohm[i*2+1] */
  double *vi, *wp;           /* help pointer */
  double resnorm, iniresnorm;/* the two residual norms */

  double dotp, scale, ci, si, hi0, hi1; /* helpers */
  int restart, it, check;
  FIDX i, j, len;
  int err;

  len=(*x).len;

  /* make sure a vector_free or free does not result in segfaults */
  res.V=NULL;
  v_all=NULL;
  H=NULL;
  g=NULL;
  y=NULL;
  Ohm=NULL;

  /* v_i and w are "dummies" which are used to pass parts of v_all to
     the preconditioner, they get the same length as x, the pointers
     will point to the begin of corresponding parts in v_all,
     initialise them: */
  v_i.V=NULL;
  v_i.len=len;
  v_i.n_max=len;

  w.V=NULL;
  w.len=len;
  w.n_max=len;

  /* useful macros */
  /* free data and return failure */
#define GMRESERRRET {\
      vector_free(&res);\
      free(v_all);\
      free(H);\
      free(g);\
      free(y);\
      free(Ohm);\
      return FAIL; }


  /* allocate double arrays */
#define GMRESDMALLOC(__who__, __somuch__) {\
  __who__=malloc( (__somuch__) * sizeof(double) ); \
  if (__who__==NULL) \
    { \
      fprintf(stderr, \
	      "failure during init of GMRES, malloc for %s\n", \
              #__who__);\
      GMRESERRRET;\
    }\
  }
  

  /* error handler for user provided functions */
#define MULTFUNCERRHANDLE {if (err!=SUCCESS) \
	{ fprintf(stderr, "failure of multfunc in GMRES\n"); \
          GMRESERRRET;\
	} \
      }
#define PRECONDIERRHANDLE {if (err!=SUCCESS) \
	{ fprintf(stderr, "failure of precondi in GMRES\n"); \
          GMRESERRRET;\
	} \
      }

  /* check for convergence */
#define LOOPCHECK { \
    /* printf("GMRES: restart=%3d it=%3d  |res|=%10.3e\n", restart,	\
       it, resnorm); /* */						\
  if (residual != NULL) \
    { *residual = resnorm; } \
  if (stoptype==0) \
    { check=1; } \
  else if (stoptype==1) \
    { check=(resnorm>atol); } \
  else if (stoptype==2) \
    { check=(resnorm>rtol*iniresnorm); } \
  else if (stoptype==3) \
    { check=((resnorm>atol)&&(resnorm>rtol*iniresnorm)); }	\
  else if ((stoptype==4)&&((iter != NULL))) \
    { check=( ((*iter)<re_max) && (resnorm>atol) ); }	\
  else \
    { fprintf(stderr, "unknown stoptype=%d in GMRES !\n", stoptype); \
      GMRESERRRET; \
    } \
  }

#define ITERCOUNT { \
  if (iter     != NULL) \
    { (*iter) ++; } \
  }


  /* malloc of datastructures */
  err=vector_alloc( &res, len );
  if (err!=SUCCESS)
    {
      fprintf(stderr,
	      "failure during init of GMRES, vector_alloc for %s\n", 
              "res");
      GMRESERRRET;
    }
  GMRESDMALLOC( v_all, len * (m +1)  );
  GMRESDMALLOC( H,     ((m+1)*m)/2+1 );
  GMRESDMALLOC( g,     m+1           );
  GMRESDMALLOC( y,     m             );
  GMRESDMALLOC( Ohm,   2*m           );


  /* initialisation of iteration */
  /* total iterations */
  if (iter != NULL) *iter=0; 
  /* restart counter */
  restart=0;
  check=1;
  while ((restart<re_max)&&(check))
    {
      /* inner iteration counter */
      it=1; 
      ITERCOUNT;

      /* residual */
      if ((restart>0)||(use_initial_x==1))
	{
	  /* res = Mat*x */
	  err= (*multfunc)( Mat, x, &res);
	  MULTFUNCERRHANDLE;
	  /* res = rhs - Mat*x */
	  for (i=0; i<len; i++)
	    res.V[i]=(*rhs).V[i]-res.V[i];
	}
      else
	{
	  /* x = 0, res = rhs */
	  for (i=0; i<len; i++)
	    {
	      res.V[i]=(*rhs).V[i];
	      (*x).V[i]=0;
	    }
	}

      /* preconditioned residual goes into v_1 */
      v_i.V=&v_all[0];
      if (precondi!=NULL)
	{
	  /* v_1 = C^{-1} * res */
	  err= (*precondi)(Mat, &res, prec_dat, &v_i);
	  PRECONDIERRHANDLE;
	}
      else
	{
	  for (i=0; i<len; i++)
	    {
	      /* v_1 = res */
	      v_i.V[i]=res.V[i];
	    }
	}
      
      /* norm of v_1 */
      dotp=0.0;
      for (i=0; i<len; i++)
	{
	  dotp += v_all[i] * v_all[i];
	}
      resnorm=sqrt(dotp);
      if (restart==0) 
	{
	  iniresnorm=resnorm;
	}

      /* scale v_1 to norm=1 */
      scale= 1.0/resnorm;
      for (i=0; i<len; i++)
	{
	  v_all[i] *= scale;
	}

      /* initialise g= resnorm* e_1 */
      g[0]=resnorm;
      for (i=1; i<m+1; i++)
	{
	  g[i]=0.0;
	}

      LOOPCHECK;

      /* inner loop */
      while ((it<=m)&&(check))
	{
	  /* res = Mat*v_it */
	  v_i.V= &v_all[(it-1)*len];
	  err= (*multfunc)( Mat, &v_i, &res);
	  MULTFUNCERRHANDLE;

	  /* v_(it+1)= w = C^{-1} * res */
	  wp= &v_all[(it)*len] ;
	  w.V= wp;
	  if (precondi!=NULL)
	    {
	      err= (*precondi)(Mat, &res, prec_dat, &w);
	      PRECONDIERRHANDLE;
	    }
	  else 
	    {
	      for (i=0; i<len; i++)
		{
		  /* y = w */
		  w.V[i]=res.V[i];
		}
	    }

	  /* build raw new collumn of H, orthogonalise w to all v_i */
	  for (i=0; i<it; i++)
	    {
	      /* H_{i,it}= (w,v_i) */
	      dotp=0.0;
	      vi=&v_all[i*len];
	      for (j=0; j<len; j++)
		{
		  dotp+=vi[j]*wp[j];
		}
	      H[((it-1)*it)/2+i]=dotp;

	      /* w = w - (w,v_i)*v_i */
	      for (j=0; j<len; j++)
		{
		  wp[j]-=dotp*vi[j];
		}
	    }

	  /* H_{it+1,it}= norm(w) */
	  dotp=0.0;
	  for (j=0; j<len; j++)
	    {
	      dotp+= wp[j]*wp[j];
	    }
	  dotp=sqrt(dotp);
	  H[((it-1)*it)/2+it]=dotp;

	  /* v_(it+1)= w/norm(w) */
	  scale = 1.0/dotp;
	  for (j=0; j<len; j++)
	    {
	      wp[j] *= scale;
	    }
	  /* apply Ohm[1..it-1] to H_{.,it} */
	  for (i=0; i<it-1; i++)
	    {
	      ci=Ohm[i*2  ];
	      si=Ohm[i*2+1];

	      hi0=H[((it-1)*it)/2+ i  ];
	      hi1=H[((it-1)*it)/2+ i+1];

	      H[((it-1)*it)/2+ i  ]=  ci*hi0 + si*hi1;
	      H[((it-1)*it)/2+ i+1]= -si*hi0 + ci*hi1;
	    }

	  /* construct Ohm[it] (see Saad, p 160..161) */
	  hi0=H[((it-1)*it)/2+ it-1];
	  hi1=H[((it-1)*it)/2+ it  ];

	  dotp=sqrt( hi0*hi0 + hi1*hi1 );
	  si=hi1/dotp;
	  ci=hi0/dotp;

	  Ohm[(it-1)*2  ]=ci;
	  Ohm[(it-1)*2+1]=si;

	  /* apply Ohm[it] to  H_{.,it} */
	  H[((it-1)*it)/2+ it-1]=  ci*hi0 + si*hi1;

	  /* deleteme 
	     printf ("H_{it+1,it}=0 == %e\n", -si*hi0 + ci*hi1); */

	  /* apply Ohm[it] to g */
	  hi0= g[it-1];
	  hi1= g[it  ];

	  g[it-1]=  ci*hi0 + si*hi1;
	  g[it  ]= -si*hi0 + ci*hi1;

	  /* resnorm = g[it+1] */
	  resnorm = fabs(g[it]);
	  
	  /* end of the inner loop */
	  it++;
	  ITERCOUNT;
	  LOOPCHECK;
	}
      /* correct the iteration counter to give the final inner iterate
      */
      it--;

      /* solve the upper triangular system H*y=g in least squares
	 sense (backward substitution) */
      /* last line first */
      for (i=it-1; i>=0; i--)
	{
	  /* substitution of already known solution values */
	  for (j=it-1; j>i; j--)
	    {
	      g[i] -= y[j]*H[((j+1)*j)/2+ i ];
	    }
	  /* solve the remaining part, which only consists of the
	     diagonal entry */
	  hi0=H[((i+1)*i)/2+ i ];
	  if (hi0!=0.0)
	    {
	      y[i]=g[i]/hi0;
	    }
	  else
	    {
	      /* cry and set y[i]=0 */
	      printf("GMRES: H with zero diagonal entry! restart=%d, it=%d\n",
		     restart, it);
	      y[i]=0.0;
	    }
	}

      /* update x = x + y[0]*v_1 +y[1]*v_2 ...  */
      for (i=0; i<it; i++)
	{
	  scale=y[i];
	  vi=&v_all[i*len];
	  for (j=0; j<len; j++)
	    {
	      (*x).V[j] += scale*vi[j];
	    }
	}

      /* be carefull in case the preconditioner is not really linear,
	 e.g. if it is a low accuracy approximation of a
	 preconditioner, therefore we force a restart if more than 3
	 iterations where done in the last inner loop */
      if (it>3) check=1;

      /* end restart loop */
      restart++;
    }

  if (check)
    {
      /* goal not met yet => failure */
      check=10;
    }
  else
    {
      /* goal met => SUCCESS */
      check=SUCCESS;
    }

  /* freeing of datastructures */
  vector_free(&res);
  free(v_all);
  free(H);
  free(g);
  free(y);
  free(Ohm);
  

#undef GMRESERRRET
#undef GMRESMALLOC
#undef MULTFUNCERRHANDLE
#undef PRECONDIERRHANDLE
#undef LOOPCHECK  

  return check;
}



/*FUNCTION*/
int chebyshev_semi_iteration(  
	   struct vector *x,
	   int iter,
	   int (*multfunc)(void *Mat, struct vector *vec,
			   struct vector *out), 
	   int (*precondi)(void *Mat, struct vector *vec,
			   void *prec_dat, struct vector *out),
	   void *Mat, struct vector *rhs, void *prec_dat,
	   double lambdaMin, double lambdaMax
/* Chebyshev semi iteration for solving systems of linear
   equations, see ???
   approximates a solution of 

          Mat*x = rhs

   Input:  iter     - the fixed number of iterations to be performed,
           multfunc - pointer to a function performing the matrix
                      vector multiplication Mat*vec, syntax:
		      int multfunc(void *Mat, struct vector *vec, 
		                   struct vector *out),
		      where Mat is the pointer provided in the call to
		      this routine and vec is a vector of the same
		      type as the solution vector x, 
		      must return SUCCESS on success and FAIL in case
		      of error 
	   precondi - pointer to a function performing the
                      preconditioning operation out=C^{-1}*vec,
		      ideally C should be spectral equivalent to the
		      system matrix Mat, e.g. C^{-1} can be a
		      cheap approximation of the inverse of Mat,
		      the quality of the preconditioner has huge
		      influence on the performance of this solver,
		      syntax:
		      int precondi(void *Mat, struct vector *vec,
		                   void *prec_dat, struct vector *out),
		      where Mat as provided to this routine, vec a
		      vector of same type as solution vector x,
		      prec_dat as provided to this routine, must
		      return SUCCESS on success and FAIL in case of
		      error, 
		      if precondi==NULL then the function is not
		      called but C^{-1}=I is used (no preconditioning)
           Mat      - pointer to data necessary to perform the
                      multiplication with the matrix, 
		      in general this should be some representation of
		      the matrix itself, but it is not used in this
		      routine directly, but only through multfunc and
		      precondi 
	   rhs      - the right-hand side vector of the equation system
	   prec_dat - pointer to additional data for the
	              preconditioner, not used in this routine itself,
	              but only in precondi
	   lambdaMin- minimal eigenvalue of the preconditioned system
	   lambdaMax- maximal eigenvalue of the preconditioned system

   Output: x        - approximate solution of the system
           
   Return: SUCCESS  - stopping criterion is fulfilled
           FAIL     - general failure, see error messages
*/
	   ){
  struct vector res, *ynew, *ymid, *yold, *tmp, y1, y2, y3, z; 
                             /* residual, and vectors for the
				iteration */
  double omega,rho,scale;

  int it;
  FIDX i, len;
  int err;

  len=(*x).len;

  /* make sure a vector_free or free does not result in segfaults */
  res.V=NULL;

  err=vector_alloc(&z, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, chebyshev_semi_iteration);

  err=vector_alloc(&y1, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, chebyshev_semi_iteration);
  err=vector_alloc(&y2, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, chebyshev_semi_iteration);
  err=vector_alloc(&y3, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, chebyshev_semi_iteration);

  err=vector_alloc(&res, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, chebyshev_semi_iteration);
  
  ynew=&y3;
  ymid=&y2;
  yold=&y1;

  for (i=0; i<len; i++)
    {
      ymid->V[i] = 0.0;
      yold->V[i] = 0.0;
    }

  omega = 1.0;
      
  rho = (lambdaMax-lambdaMin)/(lambdaMax+lambdaMin);
  scale= 2.0/(lambdaMin+lambdaMax);

  for (it=0; it < iter; it++)
    {
      omega = 1.0/(1.0-(omega*rho*rho)/4);

      /* res = Mat*ymid */
      err= (*multfunc)( Mat, ymid, &res);
      FUNCTION_FAILURE_HANDLE(err, multfunc, chebyshev_semi_iteration);
      /* res = rhs - Mat*ymid */
      for (i=0; i<len; i++)
	{
	  res.V[i]=(*rhs).V[i]-res.V[i];
	}

      /* z = C^-1 * res; */ 
      err= (*precondi)(Mat, &res, prec_dat, &z);
      FUNCTION_FAILURE_HANDLE(err, precondi, chebyshev_semi_iteration);

      /* ynew = omega*(scale*z+ymid-yold)+yold */
      for (i=0; i<len; i++)
	{
	  ynew->V[i]=omega*(scale*z.V[i]+ymid->V[i]-yold->V[i])+yold->V[i];
	}


      /* yold = ymid; ymid = ynew; */
      tmp=yold;
      yold=ymid;
      ymid=ynew;
      ynew=tmp;

    } /* end for it */


  /* copy result to x */
  for (i=0; i<len; i++)
    {
      x->V[i]=ymid->V[i];
    }

  /* clean up */
  vector_free(&res);

  vector_free(&y3);
  vector_free(&y2);
  vector_free(&y1);

  vector_free(&z);

  return SUCCESS;
}



/*FUNCTION*/
int PBSDeig(int it_max, int stoptype, double atol, double rtol,
	    int use_initial_x,
	    FIDX len, FIDX num,
	    struct vector *x, struct vector *lambda,
	    double *residual, int *iter,
	    int (*multfuncMass)(void *MatMass, struct vector *vec,
				struct vector *out), 
	    int (*multfuncStif)(void *MatStif, struct vector *vec,
				struct vector *out), 
	    int (*precondi)(void *MatStif, struct vector *vec, void *prec_dat,
			    struct vector *out),
	    void *MatMass, void *MatStif,
	    void *prec_dat
/* Preconditioned Block Steepest Descent solver for eigenvalue problems

          MatStif*x = lambda*MatMass*x 
   
   as described in [M. Weise: Eigenfrequenzanalyse realer
   CAD-Geometrien mit NetGen/NGSolve, Modellierungsseminar,
   TU-Chemnitz, 2009]
   where [A. V. Knyazev, K. Neymeyr: Efficient solution of symmetric
   eigenvalue problems using multigrid preconditioners in the locally
   optimal block conjugate gradient method, ETNA, Vol. 15 2003;
   http://math.ucdenver.edu/~aknyazev/research/papers/etna03.pdf]
   is cited as reference 

   seeks to find the num eigenvalues of smallest absolute value 


   Input:  it_max   - maximal number of iterations
	   stoptype - type of the stopping criterion:
                      0 - no criterion, just do it_max iterations,
		      1 - absolute, stop when norm(residual)<atol,
		      2 - relative to the initial residual, stop when 
		          norm(residual) < rtol*norm(residual_0)
		      3 - absolute and relative, stop if either the
		          relative or absolute stopping criterion of
		          above is fulfilled
		      other result in FAIL return + error message
           atol     - value of the absolute tolerance for the stopping
                      criteria 1 and 3, see stoptype above
           rtol     - value of the relative tolerance for the stopping
                      criteria 2 and 3, see stoptype above
           use_initial_x
                    - states if the provided x shall be used as
                      initial iterate x_0 or not,
		      = 1 : use it,
		      = 0 : don't use it, in this case pseudo-random 
		            numbers are used instead
           len      - size of the system matrices, 
	              (length of compatible vectors)
           num      - number of (smallest) eigenvalues and corresponding
	              eigenvectors to be computed
	   multfuncMass AND 
	   multfuncStif 
                    - pointer to a function performing the matrix
                      vector multiplication Mat*vec, syntax:
		      int multfunc(void *Mat, struct vector *vec, 
		                   struct vector *out),
		      where Mat is the pointer provided in the call to
		      PBSDeig and vec is a vector of size len, 
		      must return SUCCESS on success and FAIL in case
		      of error 
	   precondi - pointer to a function performing the
                      preconditioning operation out=C^{-1}*vec,
		      ideally C should be spectral equivalent to the
		      stiffness matrix MatStif, e.g. C^{-1} can be a
		      cheap approximation of the inverse of MatStif,
		      the quality of the preconditioner has huge
		      influence on the performance of PBSDeig,
		      syntax:
		      int precondi(void *MatStif, struct vector *vec,
		                   void *prec_dat, struct vector *out),
		      where Mat as provided to PBSDeig, vec a vector of
		      size len, prec_dat as
		      provided to PBSDeig, must return SUCCESS on success
		      and FAIL in case of error,
		      if precondi==NULL then the function is not
		      called but C^{-1}=I is used (no preconditioning)
           MatMass    AND
           MatStif  - pointer to data necessary to perform the
                      multiplication with the matrix, 
		      in general this should be some representation of
		      the matrix itself, but it is not used in PBSDeig
		      directly, but only through multfuncMass, 
		      multfuncStif and precondi
	   prec_dat - pointer to additional data for the
	              preconditioner, not used in PCG itself, but only
	              in precondi

   In/Out: x        - on Input:  initial iterates for all eigenvectors,
                                 if use_initial_x==1,
                                 else not used as input (then initialised
				               with pseudo random numbers),
                      on Output: approximate eigenvectors,
		      size: len*num,
		      x.V[k*len+i] is the i-th component of the k-th 
		      eigenvector

   Output: lambda   - the eigenvalues
           residual - norm of the residual of the final iterate,
                      provided by reference, not used if NULL pointer
                      is provided
           iter     - number of the last iteration,
                      provided by reference, not used if NULL pointer
                      is provided

   Return: SUCCESS  - stopping criterion is fulfilled
           FAIL     - general failure, see error messages
	   10       - itermax reached, but stopping criterion not fulfilled
*/
	 ){
  struct vector  basisV, resK, 
    Xk, A_Xk, Mass_Xk,
    basisVk, precoK;
  double *VtAV, *VtMV;
  double resnorm, iniresnorm;
  int it;
  FIDX i, j, k;
  int stop;

  double *fd_lambda;
  int LWORK=-1;
  double *fd_WORK;

  int err;
  
  if ( (*x).len != len*num)
    {
      fprintf(stderr, "PBSDeig: size of x wrong\n");
      return FAIL;
    }
  if ( (*lambda).len != num)
    {
      fprintf(stderr, "PBSDeig: size of lambda wrong\n");
      return FAIL;
    }

  /* allocate memory for temporary vectors */
  /* the Basis of the Ritz space will consist of the vectors [x,preco],
     each consisting of num vectors, thus 2*num in total */
  err=vector_alloc( &basisV, 2*num*len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, PBSDeig);
  err=vector_alloc( &resK, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, PBSDeig);
  err=vector_alloc( &A_Xk, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, PBSDeig);
  err=vector_alloc( &Mass_Xk, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, PBSDeig);

  TRY_MALLOC(VtAV, 2*num*2*num, double, PBSDeig);
  TRY_MALLOC(VtMV, 2*num*2*num, double, PBSDeig);

  /* init eigenvector approximations if necessary */
  if (use_initial_x != 1)
    {
      int seed[4]={ 123, 234, 345, 567 };
      err=vector_random(x, seed);
      FUNCTION_FAILURE_HANDLE(err, vector_random, PBSDeig);
    }


  it=0;
  stop=0;
  while ((it<it_max)&&(!stop))
    {
      it++;

      /* to get residuals that are comparable to later ones
	 Mass-normalise the initial vectors
      */
      /* fprintf(stderr, "PBSDeig: DEBUG it=%3d scale ", it); */
      for (k=0; k<num; k++)
	{
	  double normXk2=0.0;
	  double scale;
	  
	  Xk.len=len;
	  Xk.V=&(*x).V[k*len];

	  /* compute Xk'*M*Xk */
	  err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncMass, PBSDeig);
	  for (i=0; i<len; i++)
	    {
	      normXk2 += Xk.V[i] * Mass_Xk.V[i];
	    }

	  scale=1.0/sqrt(normXk2);

	  basisVk.len=len;
	  basisVk.V  =&basisV.V[k*len];
	  for (i=0; i<len; i++)
	    {
	      basisVk.V[i] = scale * Xk.V[i];
	    }
	  /* fprintf(stderr,"DEBG %8.1e ", scale); /* */
	}

      /* compute the eigenvalue approximations 
	 (Ritz-values) and preconditioned residuals, 
      */
      for (k=0; k<num; k++)
	{
	  double XkT_A_Xk=0.0;
	  double lamK;

	  Xk.len=len;
	  Xk.V=&basisV.V[k*len];
	  
	  /* compute Xk'*A*Xk */
	  err= (*multfuncStif)( MatStif, &Xk, &A_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncStif, PBSDeig);
	  for (i=0; i<len; i++)
	    {
	      XkT_A_Xk += Xk.V[i] * A_Xk.V[i];
	    }

	  /* since Xk'*M*Xk =1, XkT_A_Xk is now the Ritz value */
	  lamK=XkT_A_Xk;
	  (*lambda).V[k]=lamK;

	  /* compute M*Xk */
	  err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncMass, PBSDeig);
	  
	  /* compute resK = A_Xk   - lamK * Mass_Xk
	     .            = A * Xk - lamK *  M * Xk  */
	  for (i=0; i<len; i++)
	    {
	      resK.V[i] = A_Xk.V[i] - lamK*Mass_Xk.V[i];
	    }

	  /* precondition the residual */
	  precoK.len = len;
	  precoK.V   = &basisV.V[(k+num)*len];
	  err= (*precondi)( MatStif, &resK, prec_dat, &precoK);
	  FUNCTION_FAILURE_HANDLE(err, precondi, PBSDeig); 
	  
	  //for (i=0; i<len; i++)  precoK.V[i] = resK.V[i];

	} /* for k */


      /* now compute VtAV and VtMV */
      for (k=0; k<2*num; k++)
	{
	  Xk.len=len;
	  Xk.V=&basisV.V[k*len];

	  err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncMass, PBSDeig);
      	  err= (*multfuncStif)( MatStif, &Xk, &A_Xk);

	  for (j=0; j<2*num; j++)
	    {
	      double XjT_A_Xk=0.0;
	      double XjT_M_Xk=0.0;

	      struct vector Xj;
	      Xj.len = len;
	      Xj.V   = &basisV.V[j*len];

	      for (i=0; i<len; i++)
		{
		  XjT_A_Xk += Xj.V[i] * A_Xk.V[i];
		  XjT_M_Xk += Xj.V[i] * Mass_Xk.V[i];
		}

	      VtAV[j*(2*num)+k] = XjT_A_Xk;
	      VtAV[k*(2*num)+j] = XjT_A_Xk;
	      VtMV[j*(2*num)+k] = XjT_M_Xk;
	      VtMV[k*(2*num)+j] = XjT_M_Xk;
	    }
	} /* for k */

      /* compute M-norm of preconditioned residual */
      resnorm=0.0;
      for (k=0; k<num; k++)
	{
	  double normPrecoK;

	  normPrecoK = VtMV[(num+k)*(2*num)+(num+k)];

	  /* fprintf(stderr,"DEBUG: res[%d]=%8.1e\n",k,sqrt(normPrecoK)); /* */

	  if ( resnorm<sqrt(normPrecoK) )
	    {
	      resnorm=sqrt(normPrecoK);
	    }
	} /* for k */
      if (it==1)
	{
	  iniresnorm=resnorm;
	}
      if (residual!=NULL)
	{
	  (*residual)=resnorm;
	}
      if (iter!=NULL)
	{
	  (*iter)=it;
	}
      
      switch (stoptype)
	{ 
	case 0:
	  stop=0;
	  break;
	case 1:
	  stop=(resnorm<=atol);
	  break;
	case 2:
	  stop=(resnorm<=rtol*iniresnorm);
	  break;
	case 3:
	  stop=((resnorm<=atol)&&(resnorm<=rtol*iniresnorm));
	  break;
	default:
	  fprintf(stderr, "PBSDeig: unknown stoptype=%d !\n", stoptype);
	  return FAIL;
	}

      /* fprintf(stderr,"\nDEBUG: VtAV \n");
	 for (k=0; k<2*num; k++)
	 {
	 for (j=0; j<2*num; j++)
	 fprintf(stderr," %e ",  VtAV[j*(2*num)+k]);
	 fprintf(stderr,"\n");
	 }
	 fprintf(stderr,"\nDEBUG: VtMV \n");
	 for (k=0; k<2*num; k++)
	 {
	 for (j=0; j<2*num; j++)
	 fprintf(stderr," %e ",  VtMV[j*(2*num)+k]);
	 fprintf(stderr,"\n");
	}
      */


      /* compute the new Ritz basis */
      {
	char fc_v, fc_u;
	int fi_one, fi_m_one, fi_2num, info;
	double fd_ask_LWORK;

	fi_one   = 1; 
	fi_m_one = -1; 
	fc_v     = 'V';
	fc_u     = 'U';
	fi_2num   = 2*num;
	

	/* compute eigenvalues and eigenvectors 
	   VtAV*x = lambda*VtMV*x */
	if (LWORK==-1)
	  {
	    /* first do a workspace query and allocate memory */
	    TRY_MALLOC(fd_lambda, 2*num, double, PBSDeig);

	    dsygv_( &fi_one, &fc_v, &fc_u, &fi_2num, 
		    VtAV, &fi_2num, VtMV, &fi_2num, fd_lambda,
		    &fd_ask_LWORK, &fi_m_one, &info );
	    if (info!=0)
	      {
		fprintf(stderr, "PBSDeig: "
			"LAPACK routine dsygv_ failed in workspace query "
			"info=%d \n", info);

		return FAIL;
	      }
	    LWORK=lrint(fd_ask_LWORK)+1;
	    TRY_MALLOC(fd_WORK, LWORK, double, PBSDeig);
	    /* fprintf(stderr,"DEBUG: LWORK set to %d\n", LWORK); */
	  }

	/* the actual computation */
	dsygv_( &fi_one, &fc_v, &fc_u, &fi_2num, 
		VtAV, &fi_2num, VtMV, &fi_2num, fd_lambda,
		fd_WORK, &LWORK, &info );
	if (info!=0)
	  {
	    fprintf(stderr, "PBSDeig: "
		    "LAPACK routine dsygv_ failed eigenvalue computation "
		    "info=%d \n", info);
	    return FAIL;
	  }

	/* DEBUG: output eigenvalue approximations */
	printf("PBSDeig: it=%3d ", it);
	for (i=0; i<num; i++)
	  {
	    printf("  %13.6e ", fd_lambda[i]);
	  }
	printf(" end resn=%8.1e\n",resnorm);

	/* now choose the first num small eigenvectors and compute
	   corresponding large eigenvector approximations,
	   VtAV holds the small eigenvectors
	*/
	for (k=0; k<num; k++)
	  {
	    double scale;

	    Xk.len=len;
	    Xk.V=&(*x).V[k*len];

	    /* first column of V */
	    basisVk.len=len;
	    basisVk.V=&basisV.V[0];
	    scale=VtAV[k*(2*num)+0];
	    for (i=0; i<len; i++)
	      {
		Xk.V[i]= scale * basisVk.V[i];
	      }

	    /* remaining columns of V */
	    for (j=1; j<2*num; j++)
	      {
		basisVk.len=len;
		basisVk.V=&basisV.V[j*len];
		scale=VtAV[k*(2*num)+j];
		for (i=0; i<len; i++)
		  {
		    Xk.V[i]+= scale * basisVk.V[i];
		  }
	      }
	  } /* end for k */
      } /* end Fortran compatibility block, computation of new Ritz vectors */ 
    } /* while (!stop) */

  /* free temporary memory */
  free(fd_WORK);
  free(fd_lambda);
  free(VtMV);
  free(VtAV);
  vector_free(&resK);
  vector_free(&A_Xk);
  vector_free(&Mass_Xk);
  vector_free(&basisV);


  return SUCCESS;
}



/*FUNCTION*/
int LOBPCGeig(int it_max, int stoptype, double atol, double rtol,
	      int use_initial_x,
	      FIDX len, FIDX num, FIDX num_chk,
	      struct vector *x, struct vector *lambda,
	      double *residual, int *iter,
	      int (*multfuncMass)(void *MatMass, struct vector *vec,
				  struct vector *out), 
	      int (*multfuncStif)(void *MatStif, struct vector *vec,
				  struct vector *out), 
	      int (*precondi)(void *MatStif, struct vector *vec, void *prec_dat,
			      struct vector *out),
	      int (*projectr)(void *MatStif, struct vector *vec, void *proj_dat,
			      struct vector *out),
	      void *MatMass, void *MatStif,
	      void *prec_dat, double *prec_lambda_i, void *proj_dat
/* Locally Optimal Block Preconditioned Conjugate Gradient Method for
   eigenvalue problems

          MatStif*x = lambda*MatMass*x 
   
   as described in [M. Weise: Eigenfrequenzanalyse realer
   CAD-Geometrien mit NetGen/NGSolve, Modellierungsseminar,
   TU-Chemnitz, 2009]
   where [A. V. Knyazev, K. Neymeyr: Efficient solution of symmetric
   eigenvalue problems using multigrid preconditioners in the locally
   optimal block conjugate gradient method, ETNA, Vol. 15 2003;
   http://math.ucdenver.edu/~aknyazev/research/papers/etna03.pdf]
   is cited as reference 

   seeks to find the num eigenvalues of smallest absolute value 


   Input:  it_max   - maximal number of iterations
	   stoptype - type of the stopping criterion:
                      0 - no criterion, just do it_max iterations,
		      1 - absolute, stop when norm(residual)<atol,
		      2 - relative to the initial residual, stop when 
		          norm(residual) < rtol*norm(residual_0)
		      3 - absolute and relative, stop if either the
		          relative or absolute stopping criterion of
		          above is fulfilled
		      other result in FAIL return + error message
           atol     - value of the absolute tolerance for the stopping
                      criteria 1 and 3, see stoptype above
           rtol     - value of the relative tolerance for the stopping
                      criteria 2 and 3, see stoptype above
           use_initial_x
                    - states if the provided x shall be used as
                      initial iterate x_0 or not,
		      = 1 : use it,
		      = 0 : don't use it, in this case pseudo-random 
		            numbers are used instead
           len      - size of the system matrices, 
	              (length of compatible vectors)
           num      - number of (smallest) eigenvalues and corresponding
	              eigenvectors to be used in computation
           num_chk  - number of (smallest) eigenvectors to be used in the
                      stopping criterion, so only num_chk eigenvalues will
                      actually fulfill the tolerances, has to fulfill 
		      num_chk<=num 
	   multfuncMass AND 
	   multfuncStif 
                    - pointer to a function performing the matrix
                      vector multiplication Mat*vec, syntax:
		      int multfunc(void *Mat, struct vector *vec, 
		                   struct vector *out),
		      where Mat is the pointer provided in the call to
		      LOBPCGeig and vec is a vector of size len, 
		      must return SUCCESS on success and FAIL in case
		      of error 
	   precondi - pointer to a function performing the
                      preconditioning operation out=C^{-1}*vec,
		      ideally C should be spectral equivalent to the
		      stiffness matrix MatStif, e.g. C^{-1} can be a
		      cheap approximation of the inverse of MatStif,
		      the quality of the preconditioner has huge
		      influence on the performance of LOBPCGeig,
		      syntax:
		      int precondi(void *MatStif, struct vector *vec,
		                   void *prec_dat, struct vector *out),
		      where Mat as provided to LOBPCGeig, vec a vector of
		      size len, prec_dat as
		      provided to LOBPCGeig, must return SUCCESS on success
		      and FAIL in case of error,
		      if precondi==NULL then the function is not
		      called but C^{-1}=I is used (no preconditioning)
		      if the preconditioner needs the eigenvalue 
		      approximation, use prec_lambda_i, see below
	   projectr - pointer to a function performing the
                      projection operation out=P*vec,
		      we look for eigenvectors only in span(P)
		      syntax:
		      int precondi(void *MatStif, struct vector *vec,
		                   void *proj_dat, struct vector *out),
		      where Mat as provided to LOBPCGeig, vec a vector of
		      size len, prec_dat as
		      provided to LOBPCGeig, must return SUCCESS on success
		      and FAIL in case of error,
		      if projectr==NULL then the function is not
		      called but P=I is used (no projection)
           MatMass    AND
           MatStif  - pointer to data necessary to perform the
                      multiplication with the matrix, 
		      in general this should be some representation of
		      the matrix itself, but it is not used in LOBPCGeig
		      directly, but only through multfuncMass, 
		      multfuncStif and precondi
	   prec_dat - pointer to additional data for the
	              preconditioner, not used in LOBPCGeig itself, but only
	              in precondi, see also prec_lambda_i below
	   proj_dat - pointer to additional data for the
	              preconditioner, not used in LOBPCGeig itself, but only
	              in projectr

   In/Out: x        - on Input:  initial iterates for all eigenvectors,
                                 if use_initial_x==1,
                                 else not used as input (then initialised
				               with pseudo random numbers),
                      on Output: approximate eigenvectors,
		      size: len*num,
		      x.V[k*len+i] is the i-th component of the k-th 
		      eigenvector
	   prec_lambda_i
                    - pointer where the eigenvalue corresponding to a
                      residual vector may be written for the
                      preconditioner,
		      if ==NULL it is ignored

   Output: lambda   - the eigenvalues
           residual - norm of the residual of the final iterate,
                      provided by reference, not used if NULL pointer
                      is provided
           iter     - number of the last iteration,
                      provided by reference, not used if NULL pointer
                      is provided

   Return: SUCCESS  - stopping criterion is fulfilled
           FAIL     - general failure, see error messages
	   10       - itermax reached, but stopping criterion not fulfilled
*/
	 ){
  struct vector  basisV, resK, 
    Xk, A_Xk, Mass_Xk,
    basisVk, precoK, p;
  double *VtAV, *VtMV;
  double resnorm, iniresnorm;
  int it, basnum;
  FIDX i, j, k;
  int stop;
  int *basis_select;

  double *fd_lambda;
  int LWORK=-1;
  double *fd_WORK;

  int err;
  
  if ( num_chk>num )
    {
      fprintf(stderr, "LOBPCGeig: num_chk > num not allowed \n");
      return FAIL;
    }
  if ( (*x).len != len*num)
    {
      fprintf(stderr, "LOBPCGeig: size of x wrong\n");
      return FAIL;
    }
  if ( (*lambda).len != num)
    {
      fprintf(stderr, "LOBPCGeig: size of lambda wrong\n");
      return FAIL;
    }

  /* allocate memory for temporary vectors */
  /* the Basis of the Ritz space will consist of the vectors [x,preco,p],
     each consisting of num vectors, thus 3*num in total */
  err=vector_alloc( &basisV, 3*num*len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, LOBPCGeig);
  err=vector_alloc( &resK, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, LOBPCGeig);
  err=vector_alloc( &A_Xk, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, LOBPCGeig);
  err=vector_alloc( &Mass_Xk, len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, LOBPCGeig);
  err=vector_alloc( &p, num*len);
  FUNCTION_FAILURE_HANDLE(err, vector_alloc, LOBPCGeig);

  TRY_MALLOC(VtAV, 3*num*3*num, double, LOBPCGeig);
  TRY_MALLOC(VtMV, 3*num*3*num, double, LOBPCGeig);
  TRY_MALLOC(basis_select, 3*num,  int, LOBPCGeig);

  /* init eigenvector approximations if necessary */
  if (use_initial_x != 1)
    {
      int seed[4]={ 123, 234, 345, 567 };
      err=vector_random(x, seed);
      FUNCTION_FAILURE_HANDLE(err, vector_random, LOBPCGeig);
    }

  if (projectr!=NULL)
    {
      
    }

  it=0;
  stop=0;
  while ((it<it_max)&&(!stop))
    {
      it++;

      /* to get residuals that are comparable to later ones
	 Mass-normalise the initial vectors
      */
      /* fprintf(stderr, "LOBPCGeig: DEBUG it=%3d scale ", it); */
      for (k=0; k<num; k++)
	{
	  double normXk2=0.0;
	  double scale;
	  
	  Xk.len=len;
	  Xk.V=&(*x).V[k*len];

	  if (projectr!=NULL)
	    {
	      err= (*projectr)( MatStif, &Xk, proj_dat, &Xk);
	      FUNCTION_FAILURE_HANDLE(err, projectr, LOBPCGeig);
	    }

	  /* compute Xk'*M*Xk */
	  err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncMass, LOBPCGeig);
	  for (i=0; i<len; i++)
	    {
	      normXk2 += Xk.V[i] * Mass_Xk.V[i];
	    }

	  scale=1.0/sqrt(normXk2);

	  basisVk.len=len;
	  basisVk.V  =&basisV.V[k*len];
	  for (i=0; i<len; i++)
	    {
	      basisVk.V[i] = scale * Xk.V[i];
	    }
	  /* fprintf(stderr,"DEBG %8.1e ", scale); /* */
	}
      if (it>1)
	{
	  for (k=0; k<num; k++)
	    {
	      double normXk2=0.0;
	      double scale;
	  
	      Xk.len=len;
	      Xk.V=&p.V[k*len];

	      if (projectr!=NULL)
		{
		  err= (*projectr)( MatStif, &Xk, proj_dat, &Xk);
		  FUNCTION_FAILURE_HANDLE(err, projectr, LOBPCGeig);
		}

	      /* compute Xk'*M*Xk */
	      err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	      FUNCTION_FAILURE_HANDLE(err, multfuncMass, LOBPCGeig);
	      for (i=0; i<len; i++)
		{
		  normXk2 += Xk.V[i] * Mass_Xk.V[i];
		}
	      
	      scale=1.0/sqrt(normXk2);

	      basisVk.len=len;
	      basisVk.V  =&basisV.V[(2*num+k)*len];
	      for (i=0; i<len; i++)
		{
		  basisVk.V[i] = scale * Xk.V[i];
		}
	      /* fprintf(stderr,"DEBG %8.1e ", scale); /* */
	    }
	}

      /* compute the eigenvalue approximations 
	 (Ritz-values) and preconditioned residuals, 
      */
      for (k=0; k<num; k++)
	{
	  double XkT_A_Xk=0.0;
	  double lamK;

	  Xk.len=len;
	  Xk.V=&basisV.V[k*len];
	  
	  /* compute Xk'*A*Xk */
	  err= (*multfuncStif)( MatStif, &Xk, &A_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncStif, LOBPCGeig);
	  for (i=0; i<len; i++)
	    {
	      XkT_A_Xk += Xk.V[i] * A_Xk.V[i];
	    }

	  /* since Xk'*M*Xk =1, XkT_A_Xk is now the Ritz value */
	  lamK=XkT_A_Xk;
	  (*lambda).V[k]=lamK;

	  /* compute M*Xk */
	  err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncMass, LOBPCGeig);
	  
	  /* compute resK = A_Xk   - lamK * Mass_Xk
	     .            = A * Xk - lamK *  M * Xk  */
	  for (i=0; i<len; i++)
	    {
	      resK.V[i] = A_Xk.V[i] - lamK*Mass_Xk.V[i];
	    }

	  /* precondition the residual */
	  precoK.len = len;
	  precoK.V   = &basisV.V[(k+num)*len];
	  if (prec_lambda_i!=NULL)
	    {
	      *prec_lambda_i = lamK;
	    }
	  if (precondi!=NULL)
	    {
	      err= (*precondi)( MatStif, &resK, prec_dat, &precoK);
	      FUNCTION_FAILURE_HANDLE(err, precondi, LOBPCGeig); 
	    }
	  else
	    {
	      /* no preconditioner, just copy */
	      for (i=0; i<len; i++)
		{
		  precoK.V[i]=resK.V[i];
		}
	    }
	  
	  if (projectr!=NULL)
	    {
	      err= (*projectr)( MatStif, &precoK, proj_dat, &precoK);
	      FUNCTION_FAILURE_HANDLE(err, projectr, LOBPCGeig);
	    }
	  //for (i=0; i<len; i++)  precoK.V[i] = resK.V[i];

	} /* for k */


      /* now compute VtAV and VtMV */
      if (it==1)
	{
	  basnum=2*num;
	}
      else
	{
	  basnum=3*num;
	}

      for (k=0; k<basnum; k++)
	{
	  Xk.len=len;
	  Xk.V=&basisV.V[k*len];

	  err= (*multfuncMass)( MatMass, &Xk, &Mass_Xk);
	  FUNCTION_FAILURE_HANDLE(err, multfuncMass, LOBPCGeig);
      	  err= (*multfuncStif)( MatStif, &Xk, &A_Xk);

	  for (j=0; j<=k; j++)
	    {
	      double XjT_A_Xk=0.0;
	      double XjT_M_Xk=0.0;

	      struct vector Xj;
	      Xj.len = len;
	      Xj.V   = &basisV.V[j*len];

	      for (i=0; i<len; i++)
		{
		  XjT_A_Xk += Xj.V[i] * A_Xk.V[i];
		  XjT_M_Xk += Xj.V[i] * Mass_Xk.V[i];
		}

	      VtAV[j*(3*num)+k] = XjT_A_Xk;
	      VtAV[k*(3*num)+j] = XjT_A_Xk;
	      VtMV[j*(3*num)+k] = XjT_M_Xk;
	      VtMV[k*(3*num)+j] = XjT_M_Xk;
	    }
	} /* for k */

      /* compute M-norm of preconditioned residual */
      resnorm=0.0;
      for (k=0; k<num; k++)
	{
	  double normPrecoK;

	  normPrecoK = VtMV[(num+k)*(3*num)+(num+k)];

	  fprintf(stderr,"DEBUG: res[%d]=%8.1e\n",k,sqrt(normPrecoK)); /* */

	  if (( k<num_chk  )&&( resnorm<sqrt(normPrecoK) ))
	    {
	      resnorm=sqrt(normPrecoK);
	    }
	} /* for k */
      if (it==1)
	{
	  iniresnorm=resnorm;
	}
      if (residual!=NULL)
	{
	  (*residual)=resnorm;
	}
      if (iter!=NULL)
	{
	  (*iter)=it;
	}
      
      switch (stoptype)
	{ 
	case 0:
	  stop=0;
	  break;
	case 1:
	  stop=(resnorm<=atol);
	  break;
	case 2:
	  stop=(resnorm<=rtol*iniresnorm);
	  break;
	case 3:
	  stop=((resnorm<=atol)&&(resnorm<=rtol*iniresnorm));
	  break;
	default:
	  fprintf(stderr, "LOBPCGeig: unknown stoptype=%d !\n", stoptype);
	  return FAIL;
	}

      /* fprintf(stderr,"\nDEBUG: VtAV \n");
	 for (k=0; k<3*num; k++)
	 {
	 for (j=0; j<3*num; j++)
	 fprintf(stderr," %e ",  VtAV[j*(3*num)+k]);
	 fprintf(stderr,"\n");
	 }
	 fprintf(stderr,"\nDEBUG: VtMV \n");
	 for (k=0; k<3*num; k++)
	 {
	 for (j=0; j<3*num; j++)
	 fprintf(stderr," %e ",  VtMV[j*(3*num)+k]);
	 fprintf(stderr,"\n");
	}
      */


      /* compute the new Ritz basis */
      {
	char fc_v, fc_u;
	int fi_one, fi_m_one, fi_3num, fi_basnum, info;
	double fd_ask_LWORK;

	fi_one    = 1; 
	fi_m_one  = -1; 
	fc_v      = 'V';
	fc_u      = 'U';
	fi_3num   = 3*num;
	fi_basnum = basnum; 
	

	/* compute eigenvalues and eigenvectors 
	   VtAV*x = lambda*VtMV*x */
	if (LWORK==-1)
	  {
	    /* first do a workspace query and allocate memory */
	    TRY_MALLOC(fd_lambda, 3*num, double, LOBPCGeig);

	    dsygv_( &fi_one, &fc_v, &fc_u, &fi_3num, 
		    VtAV, &fi_3num, VtMV, &fi_3num, fd_lambda,
		    &fd_ask_LWORK, &fi_m_one, &info );
	    if (info!=0)
	      {
		fprintf(stderr, "LOBPCGeig: "
			"LAPACK routine dsygv_ failed in workspace query "
			"info=%d \n", info);
		return FAIL;
	      }
	    LWORK=lrint(fd_ask_LWORK)+1;
	    TRY_MALLOC(fd_WORK, LWORK, double, LOBPCGeig);
	    /* fprintf(stderr,"DEBUG: LWORK set to %d\n", LWORK); */
	  }

	/* write VtAV and VtMV to files for debuging */
	/* {
	   FILE *debout;
	   fprintf(stderr,"writing eig debug output file\n");
	   debout=fopen("visual/ritzmat_A.txt","w");
	   if (debout==NULL) 
	   { 
	   fprintf(stderr,"failed to open debug output file A\n");
	   return FAIL;
	   }
	   for (i=0; i<basnum; i++)
	   {
	   for (j=0; j<basnum; j++)
	   {
	   fprintf(debout,"%23.16e  ", VtAV[j*(3*num)+i]);
	   }
	   fprintf(debout,"\n");
	   }
	   fclose(debout);
	   debout=fopen("visual/ritzmat_M.txt","w");
	   if (debout==NULL) 
	   { 
	   fprintf(stderr,"failed to open debug output file M\n");
	   return FAIL;
	   }
	   for (i=0; i<basnum; i++)
	   {
	   for (j=0; j<basnum; j++)
	   {
	   fprintf(debout,"%23.16e  ", VtMV[j*(3*num)+i]);
	   }
	   fprintf(debout,"\n");
	   }
	   fclose(debout);
	   fprintf(stderr,"done writing eig debug output file\n");
	   } /* */

	/* if necessary, remove vectors from the basis to make sure
	   VtMV is positive definite */
	{
	  int all_OK=0;
	  double *M_copy;
	  TRY_MALLOC(M_copy, basnum*3*num, double, LOBPCGeig);
	  
	  for(i=0; i<basnum; i++)
	    {
	      basis_select[i]=i;
	    }

	  while (!all_OK)
	    {
	      /* copy current VtMV */
	      for (i=0; i<basnum; i++)
		for (j=0; j<basnum; j++)
		  {
		    M_copy[i*3*num+j]=VtMV[i*3*num+j];
		  }

	      /* try to compute Cholesky factorization of M_copy */
	      fi_basnum=basnum;
	      dpotrf_( &fc_u, &fi_basnum, M_copy, &fi_3num, &info );
	      
	      if(info==0)
		{
		  all_OK=1;
		}
	      else
		{
		  if (info>0)
		    {
		      int remove=info-1;
		      fprintf(stderr,"LOBPCGeig: DEBUG "
			      "Cholesky factorisation of M_copy failed, "
			      "trying to remove %d-th vector from basis\n",
			      info);
		      for (i=remove; i<basnum-1; i++)
			for (j=0; j<basnum; j++)
			  {
			    VtMV[i*3*num+j]=VtMV[(i+1)*3*num+j];
			    VtAV[i*3*num+j]=VtAV[(i+1)*3*num+j];
			  }
		      for (i=remove; i<basnum-1; i++)
			for (j=0; j<basnum; j++)
			  {
			    VtMV[j*3*num+i]=VtMV[j*3*num+(i+1)];
			    VtAV[j*3*num+i]=VtAV[j*3*num+(i+1)];
			  }
		      for (i=remove; i<basnum-1; i++)
			{
			  basis_select[i]=basis_select[i+1];
			}
		      basnum--;
		    }
		  else
		    {
		      fprintf(stderr,"LOBPCGeig: "
			      "unknown error in call to dpotrf, info=%d\n",
			      info);
		      return FAIL;
		    }
		}
	    } /* end while(!all_OK) */
	  free(M_copy);
	} /* end block if necessary remove vectors from basis */

	/* the actual computation */
	dsygv_( &fi_one, &fc_v, &fc_u, &fi_basnum, 
		VtAV, &fi_3num, VtMV, &fi_3num, fd_lambda,
		fd_WORK, &LWORK, &info );
	if (info!=0)
	  {
	    fprintf(stderr, "LOBPCGeig: "
		    "LAPACK routine dsygv_ failed eigenvalue computation "
		    "info=%d \n", info);
	    return FAIL;
	  }

	/* DEBUG: output eigenvalue approximations */
	printf("LOBPCGeig: it=%3d ", it);
	for (i=0; i<num; i++)
	  {
	    printf("  %13.6e ", fd_lambda[i]);
	  }
	printf(" end resn=%8.1e\n",resnorm);

	/* now choose the first num small eigenvectors and compute
	   corresponding large eigenvector approximations,
	   VtAV holds the small eigenvectors
	*/
	for (k=0; k<num; k++)
	  {
	    double scale;

	    Xk.len=len;
	    Xk.V=&(*x).V[k*len];

	    /* first column of V */
	    basisVk.len=len;
	    basisVk.V=&basisV.V[basis_select[0]];
	    scale=VtAV[k*(3*num)+0];
	    for (i=0; i<len; i++)
	      {
		Xk.V[i]= scale * basisVk.V[i];
	      }

	    /* remaining columns of V */
	    for (j=1; j<basnum; j++)
	      {
		basisVk.len=len;
		basisVk.V=&basisV.V[basis_select[j]*len];
		scale=VtAV[k*(3*num)+j];
		for (i=0; i<len; i++)
		  {
		    Xk.V[i]+= scale * basisVk.V[i];
		  }
	      }
	  } /* end for k */
	/* now compute p, the differences between old and new
	   eigenvector approximations, 
	   still VtAV holds the small eigenvectors
	*/
	for (k=0; k<num; k++)
	  {
	    double scale;

	    Xk.len=len;
	    Xk.V=&p.V[k*len];

	    /* num+first column of V */
	    basisVk.len=len;
	    basisVk.V=&basisV.V[basis_select[num]*len];
	    scale=VtAV[k*(3*num)+num];
	    for (i=0; i<len; i++)
	      {
		Xk.V[i]= scale * basisVk.V[i];
	      }

	    /* remaining columns of V */
	    for (j=num+1; j<basnum; j++)
	      {
		basisVk.len=len;
		basisVk.V=&basisV.V[basis_select[j]*len];
		scale=VtAV[k*(3*num)+j];
		for (i=0; i<len; i++)
		  {
		    Xk.V[i]+= scale * basisVk.V[i];
		  }
	      }
	  } /* end for k */
      } /* end Fortran compatibility block, computation of new Ritz vectors */ 
    } /* while (!stop) */

  /* free temporary memory */
  free(fd_WORK);
  free(fd_lambda);
  free(VtMV);
  free(VtAV);
  free(basis_select);
  vector_free(&p);
  vector_free(&resK);
  vector_free(&A_Xk);
  vector_free(&Mass_Xk);
  vector_free(&basisV);


  return SUCCESS;
}
