/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2013, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/
/*
FILE gen_aux.c
HEADER gen_aux.h

TO_HEADER:


// useful text macros
#include "feins_macros.h"
// data structures 
#include "sparse_struct.h"

*/




/* prototypes of external functions */
#include <math.h>
#include "mesh.h"
#include "elements.h"
#include "cubature.h"
#include "feins_lapack.h"


/* function prototypes */
#include "gen_aux.h"


/*FUNCTION*/
int gen_proj_bpx_tx(void *notused, struct vector *in,
		    void *arg3, struct vector *out
/* performs boundary condition projection and BPX preconditioning 
     out = P*C^{-1}*P^T in
   where C^{-1} is the BPX preconditioner sum(l=level)Q_l*D_l*Q_l^T 
   and P a project
   
   Input:  notused - well, it is not used but in the interface to
                     allow this function to be used as a
                     preconditioner
           in      - input vector
	   arg3=
           bpx     - struct bpxdata containing pointers to the mesh
                     a multilevel struct, a multilevel help vector,
		     and a projector1

   Output: out    - (given by reference), P*P^T* in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i, j, d;
  int err;
  FIDX vx_nr, hi_w, hi_nr, fath1, fath2, child, eg_w, eg_nr;
  FIDX level, level_old, lvlmax;
  FIDX ml_type, mldim;
  FIDX l_mctxhilvl, l_mctxhichld,  l_mctxhifat1;

  struct bpxdata       *bpx;
  struct mesh          *m;
  struct multilvl      *ml;
  struct projector1    *P;

  bpx    = arg3;
  m      = (*bpx).msh;
  ml     = (*bpx).ml;
  lvlmax = (*ml).lmax;
  ml_type= (*ml).type;
  mldim  = (*ml).dim;
  P      = (*bpx).P;

  vx_nr  = (*m).vx_nr;
  eg_nr  = (*m).eg_nr;
  eg_w   = (*m).eg_w;
  hi_nr  = (*m).hi_nr;
  hi_w   = (*m).hi_w;

  switch (ml_type)
    {
    case 1:
      l_mctxhilvl  = MCT1HILVL;
      l_mctxhichld = MCT1HICHLD;
      l_mctxhifat1 = MCT1HIFAT1;
      break;
    case 2:
      l_mctxhilvl  = MCT2HILVL;
      l_mctxhichld = MCT2HICHLD;
      l_mctxhifat1 = MCT2HIFAT1;
      break;
    default:
      fprintf(stderr,"gen_proj_bpx_tx: unknown type=%d\n",
	      (int) ml_type);
      return FAIL;
    }
    

  if ( (((*in).len!=(*out).len)||((*in).len!=vx_nr*mldim))||
       (*ml).nlevl[(*ml).lmax]-(*ml).nlevl[(*ml).lmax+1]!=vx_nr*mldim)
    {
      fprintf(stderr,
	      "gen_proj_bpx_tx: dimensions don't make sense!\n");
      return FAIL;
    }

  /* copy in to hvec */
  for (i=0; i<(*in).len; i++)
    {
      (*bpx).hvec[i]=(*in).V[i];
    }

  if (P!=NULL)
    {
      /* apply P^T */
      for (i=0; i<(*P).len; i++)
	{
	  (*bpx).hvec[ (*P).V[i] ]   = 0.0 ;
	}
    }

  /* calculate r_l = Q_l^T r = [I P_l^T] r_(l+1), r_lmax=r for all
     levels l, r_l is stored in hvec[ nlevl[l+1]...nlevl[l]-1 ], */
  level_old=(*ml).lmax+1;
  for (i=hi_nr-1; i>=0; i--)
    {
      level=(*m).hier[i*hi_w+l_mctxhilvl]+1;
      if (level<=0) printf("error, level=%d\n", (int) level);
      if (level_old!=level)
	{
	  FIDX l_vx_nr=((*ml).nlevl[level]-(*ml).nlevl[level+1])/mldim;
      
	  /* initialise the new (coarser) level */
	  for (j=0; j<l_vx_nr; j++)
	    for (d=0; d<mldim; d++)
	      {
		/* get the entry number of the coarser level */
		child = j+d*vx_nr;
		MLVLFINDENTRY(fath2, child, level-1, *ml);
		if (fath2>=0) 
		  {
		    MLVLFINDENTRY(child, child, level, *ml);
		    /* the I (identity) part, r_l = r_(l+1) */
		    (*bpx).hvec[fath2]=(*bpx).hvec[child];
		  }
	      }
	  level_old=level;
	}

      for (d=0; d<mldim; d++)
	{
	  child=(*m).hier[i*hi_w+l_mctxhichld  ]+d*vx_nr;
	  fath1=(*m).hier[i*hi_w+l_mctxhifat1  ]+d*vx_nr;
	  fath2=(*m).hier[i*hi_w+l_mctxhifat1+1]+d*vx_nr;

	  MLVLFINDENTRY(fath1, fath1, level-1, *ml);
	  MLVLFINDENTRY(fath2, fath2, level-1, *ml);
	  MLVLFINDENTRY(child, child, level,   *ml);

	  //printf("i=%d   d=%d  f1=%d f2=%d c=%d\n",i,d,fath1,fath2,child);

	  /* the P_l^T part */
	  (*bpx).hvec[fath1] += 0.5*(*bpx).hvec[child];
	  (*bpx).hvec[fath2] += 0.5*(*bpx).hvec[child]; 
	}
    }


#ifdef DO_PROJ_ALL_LVL_BPX
  if (P!=NULL)
    {
      /* perform the boundary projection on all levels */
      for (i=0; i<(*P).len; i++)
	{
	  FIDX node, nodel;
	  node=(*P).V[i];
	  for (j=0; j<=(*ml).lmax; j++)
	    {
	      MLVLFINDENTRY(nodel, node, j, *ml);
	      if (nodel>=0)
		{
		  (*bpx).hvec[nodel]=0.0;
		}
	    }
	}
    }
#endif



  /* calculate w_l = [I ; P_l] w_(l-1) + D_l r_l, w_0=K_0^(-1) r_0 */
  /* initialise the coarsest level, 
     appy the coarse grid solver, if available,
     otherwise nothing has to be done in 2d as D_0 = I */
  if ((*bpx).cmat!=NULL) 
    {
      FIDX dof;
      struct vector rhs_x;

      /* here we suppose the coarse grid nodes have consecutive
	 numbers and start from 0 */
      dof=0;
      MLVLFINDENTRY(dof, dof, 0, *ml);

      rhs_x.V=&(*bpx).hvec[dof];
      rhs_x.len=(*ml).nlevl[0]-(*ml).nlevl[1];
      rhs_x.n_max=rhs_x.len;

      /* the dirichlet dofs are set to zero by the above projection */
#ifdef FEINS_have_warning 
#warning "requires projection of residual first?"
#endif

      /* coarse grid solve */
      err=coarse_mat_solve( (*bpx).cmat, NoTrans, &rhs_x, &rhs_x);
      FUNCTION_FAILURE_HANDLE(err, coarse_mat_solve,
			      stokes_projector_part_bpx);

      /* the solution is in w_0 */
    }

  /* calculate w_l = [I ; P_l] w_(l-1) + D_l r_l, */
  level_old=0; 
  for (i=0; i<hi_nr; i++)
    {
      level=(*m).hier[i*hi_w+l_mctxhilvl]+1;

      if (level_old!=level)
	{
	  FIDX lm1_vx_nr=((*ml).nlevl[level-1]-(*ml).nlevl[level])/mldim;
      
	  /* initialise the new (finer) level */
	  for (j=0; j<lm1_vx_nr; j++)
	    for (d=0; d<mldim; d++)
	      {
		/* get the entry number of the coarser level */
		fath2 = j+d*vx_nr;
		MLVLFINDENTRY(child, fath2, level, *ml);
		MLVLFINDENTRY(fath2, fath2, level-1, *ml);
		/* the I (identity) part */
		(*bpx).hvec[child]+=(*bpx).hvec[fath2];
	      }
	  level_old=level;
	}
      
      for (d=0; d<mldim; d++)
	{
	  child=(*m).hier[i*hi_w+l_mctxhichld  ]+d*vx_nr;
	  fath1=(*m).hier[i*hi_w+l_mctxhifat1  ]+d*vx_nr;
	  fath2=(*m).hier[i*hi_w+l_mctxhifat1+1]+d*vx_nr;
      
	  MLVLFINDENTRY(fath1, fath1, level-1, *ml);
	  MLVLFINDENTRY(fath2, fath2, level-1, *ml);
	  MLVLFINDENTRY(child, child, level,   *ml);
      
	  /* the P_(l-1) part */
	  (*bpx).hvec[child]+=0.5*( (*bpx).hvec[fath1] 
				    + (*bpx).hvec[fath2] );
	}
    }

  /* copy to out */
  for (i=0; i<(*out).len; i++)
    {
      (*out).V[i]=(*bpx).hvec[i];
    }

  if (P!=NULL)
    {
      /* apply P */
      for (i=0; i<(*P).len; i++)
	{
	  (*out).V[ (*P).V[i] ]   = 0.0 ;
	}
    }

  /* done */
  return SUCCESS;
}


/*FUNCTION*/
int gen_proj_MG_tx(void *arg1, struct vector *in,
		   void *arg3, struct vector *out
/* performs the boudary condition projection and multigrid
   preconditioning, 
     out = P*C^-1*P^T * in
   where P projects the velocity components of boundary nodes to zero,
   (such that addition of a projected vector doesn't change the
   velocity there), and C^-1 repressents the action of the multigrid
   preconditioning, that is one V cycle
   
   Input:  arg1    - not used, in the interface for compatibility reasons
           in      - input vector
	   arg3=
	   mg      - mgdata struct, containing everything needed for
	             the projection and multigrid

   Output: out    - (given by reference), P*C^-1P^T* in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i, j, len;
  int err;
  FIDX dim, vx_nr;

  struct vector xi, bi, invdiagi, dxi;

  struct mgdata *mg;
  struct mesh *m;
  struct multilvl *ml;

  struct sparse *Ks;
  struct projector1 *P;

  double *xl, *dxl, *bl, *invdiag;

  double alpha;

  FIDX lvl, lmax, lmin, lvl_vx, smooths, vcycles, vccount;


  mg     = arg3;

  m      = (*mg).msh;
  ml     = (*mg).ml;

  Ks     = (*mg).Ks;
  P      = (*mg).P;

  dim    = (*ml).dim;
  vx_nr  = (*mg).vx_nr;
  len    = dim * vx_nr;

  xl      = (*mg).xl;
  dxl     = (*mg).dxl;
  bl      = (*mg).bl;
  invdiag = (*mg).invdiag;

  smooths  = (*mg).smooths;
  vcycles  = (*mg).vcycles;
  alpha    = (*mg).CGC_scale;

  if ((len!=(*in).len)||(len!=(*out).len))
    {
      fprintf(stderr,
	      "gen_proj_MG_tx: in or out length doesn't match\n");
      return FAIL;
    }


  lmax = (*ml).lmax;

  lmin=0; 


  /* copy in to bl */
  for (i=0; i<(*in).len; i++)
    {
      bl[i]=(*in).V[i];
    }

  /* apply P^T */
  for (i=0; i<(*P).len; i++)
    {
      bl[ (*P).V[i] ] = 0.0 ;
    }

  /* apply C^-1 */

  if (vcycles<=0)
    {
      fprintf(stderr, "gen_proj_MG_tx: vcycles<=0 not defined\n");
      return FAIL;
    }
 

  /* perform vcycles V-cycles */
  for (vccount=0; vccount<vcycles; vccount++)
    {
      /* V-cycle downward */
      for (lvl=lmax; lvl>=lmin; lvl--)
	{ 
	  lvl_vx=(*ml).nlevl[lvl]-(*ml).nlevl[lvl+1];
	  
	  if ((vccount==0)||(lvl<lmax))
	    {
	      /* set xl=0 */
	      for (j=(*ml).nlevl[lvl+1]; j<(*ml).nlevl[lvl]; j++)
		xl[j]=0.0;
	    }
	  
	  /* make x_l, b_l, invdiag_l accessible as vector */
	  xi.len=lvl_vx;
	  xi.n_max=lvl_vx;
	  xi.V=&xl[(*ml).nlevl[lvl+1]];

	  bi.len=lvl_vx;
	  bi.n_max=lvl_vx;
	  bi.V=&bl[(*ml).nlevl[lvl+1]];

	  invdiagi.len=lvl_vx;
	  invdiagi.n_max=lvl_vx;
	  invdiagi.V=&invdiag[(*ml).nlevl[lvl+1]];

	  if ((lvl==0)&&((*mg).cmat!=NULL))
	    {
	      /* use the coarse grid solver */
	      struct vector bcoarse;
	      bcoarse.len=lvl_vx;
	      bcoarse.n_max=lvl_vx;
	      bcoarse.V=&bl[(*ml).nlevl[lvl+1]];

	      /* projection */
	      for (i=0; i<(*P).len; i++)
		{
		  FIDX dof, child;
		  dof=(*P).V[i];
		  MLVLFINDENTRY(child, dof, lvl, *ml);
		  if (child>=0)
		    {
		      bl[child]=0.0;
		    }
		}


	      err=coarse_mat_solve( (*mg).cmat, NoTrans,
				    &bcoarse, &xi);
	      FUNCTION_FAILURE_HANDLE( err, coarse_mat_solve,
				       gen_proj_MG_tx);    
	    }
	  else
	    {
	      /* do smooths Gauss-Seidel sweeps forward */
	      err=sparse_GS_sweep_fwd( &Ks[lvl], &bi,
				       &invdiagi, smooths, &xi);
	      FUNCTION_FAILURE_HANDLE( err, sparse_GS_sweep_fwd,
				       gen_proj_MG_tx);
		    
	      if (lvl==0)
		{
		  /* do smooths Gauss-Seidel sweeps backward */
		  err=sparse_GS_sweep_bwd( &Ks[lvl], &bi,
					   &invdiagi, smooths, &xi);
		  FUNCTION_FAILURE_HANDLE( err, sparse_GS_sweep_bwd,
					   gen_proj_MG_tx);/* */
		}
	    }

	  
	  if (lvl>0)
	    {
	      /* compute the residual on this lvl */
	      /* compute the matrix vector product,
		 dxl=K*xl */
	      dxi.V   = &dxl[(*ml).nlevl[lvl+1]];
	      dxi.len = lvl_vx;
	      dxi.n_max = lvl_vx;
	      err=sparse_mul_mat_vec( &Ks[lvl], &xi, &dxi);
	      FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec,
				       gen_proj_MG_tx);

	      /* now change dxl to rl=bl-K*xl */
	      for (j=(*ml).nlevl[lvl+1]; j<(*ml).nlevl[lvl]; j++)
		dxl[j]=bl[j]-dxl[j];

	      /* restrict rl to r_l-1 */
	      err=mg_restrict_tx( mg, lvl, lvl-1, dxl);
	      FUNCTION_FAILURE_HANDLE( err, mg_restrict_tx,
				       gen_proj_MG_tx);
	      /* copy r_l-1 to b_l-1 */
	      for (j=(*ml).nlevl[lvl]; j<(*ml).nlevl[lvl-1]; j++)
		bl[j]=dxl[j];
	    }
	} /* end V-cycle downward */

      /* V-cycle upward */
      for (lvl=lmin+1; lvl<=lmax; lvl++)
	{ 
	  lvl_vx=(*ml).nlevl[lvl]-(*ml).nlevl[lvl+1];

	  /* apply the update computed in the lower level */
	  /* copy the update to dx */
	  for (j=(*ml).nlevl[lvl]; j<(*ml).nlevl[lvl-1]; j++)
	      dxl[j]=xl[j];


	  /* interpolate dx to lvl */
	  err=mg_interpolate_tx( mg, lvl-1, lvl, dxl);
	  FUNCTION_FAILURE_HANDLE( err, mg_interpolate_tx,
				   gen_proj_MG_tx);


	  /* apply the update to xl */
	  for (j=(*ml).nlevl[lvl+1]; j<(*ml).nlevl[lvl]; j++)
	    xl[j]+=alpha*dxl[j];

	  /* make x_l, b_l, invdiag_l accessible as vector */
	  xi.len=lvl_vx;
	  xi.n_max=lvl_vx;
	  xi.V=&xl[(*ml).nlevl[lvl+1]];

	  bi.len=lvl_vx;
	  bi.n_max=lvl_vx;
	  bi.V=&bl[(*ml).nlevl[lvl+1]];

	  invdiagi.len=lvl_vx;
	  invdiagi.n_max=lvl_vx;
	  invdiagi.V=&invdiag[(*ml).nlevl[lvl+1]];
	  
	  /* do smooths Gauss-Seidel sweeps backward */
	  err=sparse_GS_sweep_bwd( &Ks[lvl], &bi,
				   &invdiagi, smooths, &xi);
	  FUNCTION_FAILURE_HANDLE( err, sparse_GS_sweep_bwd,
				   gen_proj_MG_tx);
	} /* end V-cycle upward */
    } /* end V-cycles loop */
  
  (*mg).vccount+=vccount;



  /* copy xl to out */
  for (i=0; i<(*out).len; i++)
    {
      (*out).V[i]=xl[i];
    }

  /* apply P */
  for (i=0; i<(*P).len; i++)
    {
      (*out).V[ (*P).V[i] ] = 0.0 ;
    }

  /* done */

  return SUCCESS;
}

/* inline functions */
#include "assembly_inline_funs.h"

/*FUNCTION*/
int gen_zz_interpolate_gradient_tx(struct mesh *m,
				   int type,
				   FIDX ncomp,
				   struct vector *u,
				   struct vector *gradu
/* averages the values of the gradient (or the jacobian) at the nodes
   of the mesh, thus interpolation of these provides smoothed
   gradients.
   this idea is for example used in the ZZ type error estimators

   ideally this will be done according to 

   O.C. Zienkiewicz and J.Z. Zhu, The Superconvergent patch recovery
   and a posteriori error estimators. Part 1. The recovery technique,
   Int. J. Numer. Methods Eng., 33, 1331-1364 (1992)

   but for now it is only a simplified version of this idea.

   
   Input:  m       - the mesh, has to be T1 or T2 
           type    - defining which type of mesh this actually is
	             type=1 ==> linear,  type=2 quadratic triangles
           ncomp   - number of (solution) components in u, i.e. number
                     of variables per node in the mesh
	   u       - coefficient vector representing a function on
	             the current mesh (e.g. solution of PDE problem)
		     u.V[r*m->vx_nr+i] has to be the value of the r-th
		     component of the (possibly) vector valued
		     function u at node i of the mesh

   Output: gradu   - the gradient (or Jacobian) of u averaged at the
                     nodes of the mesh, has to be of size
                     dim*ncomp*vx_nr,
		     gradu.V[d*ncomp*vx_nr + r*vx_nr +i]
		     will be the averaged value of the partial derivative
		     wrt. to the d-th coordinate of the r-th component
		     of function u at the i-th node of the mesh

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i, j, r, el, pt, cmp;
  int err;

  FIDX dim, basn, bas_n, vx_nr, el_w, vx_w;

  enum elemtype eltype;

  double points_t1[2*3]={ 0.0, 0.0,
			  1.0, 0.0,
			  0.0, 1.0 };
  double points_t2[2*6]={ 0.0, 0.0,
			  1.0, 0.0,
			  0.0, 1.0,
			  0.5, 0.0,
			  0.5, 0.5,
			  0.0, 0.5 };
  double points_e1[4*4]={ 0.0, 0.0, 0.0,
			  1.0, 0.0, 0.0,
			  0.0, 1.0, 0.0,
			  0.0, 0.0, 1.0 };
  double *points;


  /* for FORTRAN calls: */
  char   fNoTrans='N';
  double done=1.0, dzero=0.0;
  int fdim, fbas_n;

  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  FIDX   *ldofs;         /* list of local dofs */
  double *Jinvgrad;      /* inverse Jacobian times gradphi */
  double detJac;         /* determinant of the Jacobian */
  double AdetJac;        /* abs(detJac) */
  double *phi, *gradp, *hessphi;
  double *lgradu;         /* the local gradient of u */

  double *weight_count;  /* summs the weights of all contributions to
			    each node, (for now this will mean counts
			    the number of elements the node belongs to */

  FIDX l_mctxELNOD1, l_mctxVXSTRT;

  vx_nr = m->vx_nr;
  dim   = m->dim;
  vx_w  = m->vx_w;
  el_w  = m->el_w;

  if ((*u).len!=ncomp*vx_nr)
    {
      fprintf(stderr,
	      "gen_zz_interpolate_gradient_tx: "
	      "size of u does not match ncomp*vx_nr\n");
      return FAIL;
    }
  
  if ((*gradu).len!=dim*ncomp*vx_nr)
    {
      fprintf(stderr,
	      "gen_zz_interpolate_gradient_tx: "
	      "size of gradu does not match dim*ncomp*vx_nr\n");
      return FAIL;
    }

  switch (type)
    {
    case 1:
      switch (dim)
	{
	case 2:
	  l_mctxVXSTRT = MCT1VXSTRT;
	  l_mctxELNOD1 = MCT1ELNOD1;
	  points = points_t1;
	  basn   = 3;
	  eltype = tria;
	  break;
	case 3:
	  l_mctxVXSTRT = MCE1VXSTRT;
	  l_mctxELNOD1 = MCE1ELNOD1;
	  points = points_e1;
	  basn   = 4;
	  eltype = tetra;
	  break;
	default:
	  fprintf(stderr,"gen_zz_interpolate_gradient_tx: unknown dim=%d\n",
		  dim);
	  return FAIL;
	}
      break;
    case 2:
      l_mctxVXSTRT = MCT2VXSTRT;
      l_mctxELNOD1 = MCT2ELNOD1;
      points = points_t2;
      basn   = 6;
      eltype = tria;
      break;
    default:
      fprintf(stderr,"gen_zz_interpolate_gradient_tx: unknown type=%d\n",type);
      return FAIL;
    }

  /* init */
  /* initialise interpolation info  */
  err= eval_basis( dim, eltype, type, basn, points,
		   &bas_n, &phi, &gradp, &hessphi);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, gen_zz_interpolate_gradient_tx);
  free( hessphi );

  if (basn!=bas_n)
    {
      fprintf(stderr,"gen_zz_interpolate_gradient_tx: basn!=bas_n\n");
      return FAIL;
    }

  TRY_MALLOC( weight_count, vx_nr, double, gen_zz_interpolate_gradient_tx);

  /* allocate memory for Jac, Jacinf */
  TRY_MALLOC( Jac, dim*dim, double, gen_zz_interpolate_gradient_tx);
  TRY_MALLOC( Jacinv, dim*dim, double, gen_zz_interpolate_gradient_tx);
  /* allocate memory for Jinvgrad */
  TRY_MALLOC( Jinvgrad, dim*bas_n, double, gen_zz_interpolate_gradient_tx);
  /* allocate memory for ldofs */
  TRY_MALLOC( ldofs, ncomp*bas_n, FIDX, gen_zz_interpolate_gradient_tx);
  TRY_MALLOC( lgradu, dim, double, gen_zz_interpolate_gradient_tx);

  /* seto gradu to zero */
  for(i=0; i<gradu->len; i++)
    {
      gradu->V[i]=0.0;
    }

  /* seto weight_count to zero */
  for(i=0; i<vx_nr; i++)
    {
      weight_count[i]=0.0;
    }

  /* loop over all elements */
  for (el=0; el<m->el_nr; el++)
    {
      /* define ldofs */
      for(i=0; i<ncomp; i++)
	for(j=0; j<bas_n; j++)
	  {
	    ldofs[i*bas_n+j]=i*vx_nr + m->elem[el*el_w+ l_mctxELNOD1 +j];
	  }

      /* loop over the nodes of the element */
      for(pt=0; pt<bas_n; pt++)
	{
	  /* evaluate the gradient at this point */
	  /* Jac=0 */
	  for (i=0; i<dim*dim; i++)
	    Jac[i]=0.0;
	  
	  /* Jac = sum_{i=nodes} vertex(i)*gradphi_i^T */
	  for (i=0;i<bas_n; i++)
	    {
	      for (j=0; j<dim; j++)
		for (r=0; r<dim; r++)
		  {
		    Jac[j*dim+r]+= 
		      m->vertex[ldofs[i]*vx_w + l_mctxVXSTRT+j]
		      * gradp[pt*bas_n*dim +i*dim +r];
		  }
	    }

	  /* get detJac and Jacinv */
	  err=matrix_dim_x_dim_invert(dim, Jac,dim, Jacinv,dim, &detJac);
	  FUNCTION_FAILURE_HANDLE( err, matrix_dim_x_dim_invert,
				   gen_zz_interpolate_gradient_tx);

	  AdetJac = fabs(detJac);


	  /* real world gradient, not constant for quadratic elements */
	  /* Jinvgrad= Jacinv * gradphi[k,:,:]  */
	  fdim=dim;
	  fbas_n=bas_n;
	  dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n, &fdim, 
		  &done, Jacinv, &fdim, &(gradp[pt*bas_n*dim]), &fdim,
		  &dzero, Jinvgrad, &fdim );

	  /* loop over all components of u */
	  for(cmp=0; cmp<ncomp; cmp++)
	    {
	      /* compute the gradient of the component of u */
	      for (i=0; i<dim; i++) 
		{
		  lgradu[i] = 0.0;
		}

	      /* gradu = sum u_j * gradphi_j */
	      for (r=0; r<dim; r++) /* derivate wrt x or y */
		for (i=0; i<bas_n; i++)
		  {
		    lgradu[r] += 
		      u->V[ldofs[cmp*bas_n+i]]*Jinvgrad[i*dim+r];
		  }

	      /* add this to gradu */
	      for (r=0; r<dim; r++) /* derivate wrt x or y */
		{
		  gradu->V[r*ncomp*vx_nr + cmp*vx_nr +ldofs[pt]]
		    +=lgradu[r];
		}
	    } /* end loop over components */
	      
	  weight_count[ldofs[pt]]+=1.0;

	} /* end loop over local nodes */

    } /* end loop over all elements */


  /* now divide by accumulated the weights */
  for(r=0; r<dim; r++)
    for(cmp=0; cmp<ncomp; cmp++)
      for(i=0; i<vx_nr; i++)
	{
	  /* printf("r=%d cmp=%d i=%3d gradu=%e weight=%e\n",
	     r, cmp, i, gradu->V[r*ncomp*vx_nr + cmp*vx_nr +i],
	     weight_count[i]); */
	  gradu->V[r*ncomp*vx_nr + cmp*vx_nr +i]/=weight_count[i];
	}

  free(lgradu);
  free(ldofs);
  free(Jinvgrad);
  free(Jacinv);
  free(Jac);
  free(weight_count);
  free(gradp);
  free(phi);

  return SUCCESS;
}





/*FUNCTION*/
int gen_error_est_ZZ_tx(struct mesh *m, FIDX ncomp, struct vector *uh,
			FIDX *marker, FIDX *nrmarked, double *globest,
			struct solver_settings *set, int type
/* estimates the interpolation error for a finite element function in
   the H1-semi-norm using the technique of Zienkiewicz and Zhu's
   superconvergent patch recovery, 


   a simplified approach similar to  
   [
   O.C. Zienkiewicz and J.Z. Zhu, The Superconvergent patch recovery
   and a posteriori error estimators. Part 1. The recovery technique,
   Int. J. Numer. Methods Eng., 33, 1331-1364 (1992)
   
   O.C. Zienkiewicz and J.Z. Zhu, The Superconvergent patch recovery
   and a posteriori error estimators. Part 2. Error estimates and
   adaptivity, Int. J. Numer. Methods Eng., 33, 1365-1382 (1992)
   ]


   Input:  m           - the mesh
           ncmop       - the number of components in uh (per node of
                         the mesh)
	   uh	       - coefficient vector representing a function on
	                 the current mesh (e.g. solution of PDE problem)
			 u.V[r*m->vx_nr+i] has to be the value of the r-th
			 component of the (possibly) vector valued
			 function u at node i of the mesh
	   set	       - holding the settings for the process
           type        - defining which type of mesh this actually is
	                 type=1 ==> linear    triangles
			 type=2 ==> quadratic triangles

   Output: marker      - vector of length msh.el_nr with
                         marker[i]==1 -> element i isto be refined,
           nrmarked    - number of marked elements, i.e. number of
                         non-zero entries in marker (given by reference)
           globest     - value of the global estimate (given by reference)

   Return: SUCCESS     - success
           FAIL        - failure, see error message, output will not be
	                 valid
*/
		    ){
			    
  FIDX el,i,j,k,r,cmp;
  int err;

  FIDX dim, bas_n, dim_bas_n, order,  
    vx_nr, el_w, vx_w, el_nr;

  FIDX subtypes;

  struct int_data iform; /* integration formula    */

  /* for FORTRAN calls: */
  char   fNoTrans='N';
  double done=1.0, dzero=0.0;
  int fdim, fbas_n;
    
  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  double *Jinvgrad;      /* real world gradient = inverse Jacobian times gradphi */
  double detJac;         /* determinant of the Jacobian */
  double AdetJac;        /* abs(detJac) */
  double *phi, *gradp;
  FIDX   *ldofs;         /* local dofs */

  struct vector nk; 	 /* vector containing the squares of the local error 
			    estimates */

  FIDX l_mctxELNOD1, l_mctxVXSTRT;

  struct vector gradu_interp;

  enum elemtype eltype;

  dim=(*m).dim;

  switch (type)
    {
    case 1:
      switch (dim)
	{
	case 2:
	  l_mctxVXSTRT = MCT1VXSTRT;
	  l_mctxELNOD1 = MCT1ELNOD1;
	  eltype = tria;
	  order=4;
	  break;
	case 3:
	  l_mctxVXSTRT = MCE1VXSTRT;
	  l_mctxELNOD1 = MCE1ELNOD1;
	  eltype = tetra;
	  order = 2;
	  break;
	default:
	  fprintf( stderr,
		   "gen_error_est_ZZ_tx: dim=%d is not supported\n",
		   dim);
	  return FAIL;
	}
      break;
    case 2:
      l_mctxVXSTRT = MCT2VXSTRT;
      l_mctxELNOD1 = MCT2ELNOD1;
      eltype = tria;
      order=7;
      if (dim!=2)
	{
	  fprintf( stderr,
		   "gen_error_est_ZZ_tx type=2 is only implemented for 2D meshes!\n");
	  return FAIL;
	}			    
      break;
    default:
      fprintf(stderr,"gen_error_est_ZZ_tx: unknown type=%d\n",type);
      return FAIL;
    }

  
  err=vector_alloc( &gradu_interp, dim*(uh->len)); 
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, gen_error_est_ZZ_tx);


  err=gen_zz_interpolate_gradient_tx(m, type, ncomp, uh, &gradu_interp); 
  FUNCTION_FAILURE_HANDLE( err, gen_zz_interpolate_gradient_tx, gen_error_est_ZZ_tx);
  

  subtypes=type;

  err=cubature_bases( dim, order, eltype, 1, &subtypes, &iform); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, gen_error_est_ZZ_tx);

  /* make phi and gradphi better accessible */
  phi   = (iform.bases[0]->phi);
  gradp = (iform.bases[0]->gradphi);
  bas_n = (iform.bases[0]->num_basis);

  /* for FORTRAN calls */
  fdim    = (int) dim;
  fbas_n  = (int) bas_n;
  /* for easier accces */
  vx_nr = (*m).vx_nr;
  el_nr = (*m).el_nr;
  vx_w  = (*m).vx_w;
  el_w  = (*m).el_w;
  dim_bas_n = dim*bas_n;
    
  /* allocate memory for gradients, hessian, jacobians,... */    
  TRY_MALLOC( Jac, dim*dim, double, gen_error_est_ZZ_tx);
  TRY_MALLOC( Jacinv, dim*dim, double, gen_error_est_ZZ_tx); 
  TRY_MALLOC( Jinvgrad, dim_bas_n, double, gen_error_est_ZZ_tx); 
  TRY_MALLOC( ldofs, ncomp*bas_n, FIDX, gen_error_est_ZZ_tx);

  err=vector_alloc( &nk, el_nr);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, gen_error_est_ZZ_tx);

  for (i=0; i<el_nr; i++) 
    {
      marker[i] = 0;		 
      nk.V[i]   = 0.0;
    }
  
  /* Loop over all elements */    
  for (el=0; el<el_nr; el++)
    {  
      /* define ldofs */
      for(i=0; i<ncomp; i++)
	for(j=0; j<bas_n; j++)
	  {
	    ldofs[i*bas_n+j]=i*vx_nr + m->elem[el*el_w+ l_mctxELNOD1 +j];
	  }
      
      /* loop over integration points */
      for(k=0; k<iform.num_points; k++)
	{
	  /* compute the Jacobian */
	  for (i=0; i<dim*dim; i++) Jac[i]=0.0;  
	  /* Jac = sum_{i=nodes} vertex(i)*gradphi_i^T */
	  for (i=0; i<bas_n; i++)
	    for (j=0; j<dim; j++)
	      for (r=0; r<dim; r++)
		{
		  Jac[j*dim+r]+= 
		    (*m).vertex[(*m).elem[el*el_w+l_mctxELNOD1+i]
				*vx_w +l_mctxVXSTRT+j]
		    *gradp[k*bas_n*dim+i*dim +r];
		}

	  /* get detJac and Jacinv */
	  err=matrix_dim_x_dim_invert(dim, Jac,dim, Jacinv,dim, &detJac);
	  FUNCTION_FAILURE_HANDLE( err, matrix_dim_x_dim_invert,
				   gen_zz_interpolate_gradient_tx);

	  AdetJac = fabs(detJac);
	
	  /* real world gradient, not constant for quadratic elements */
	  /* Jinvgrad= Jacinv * gradphi[k,:,:]  */
	  dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n, &fdim, 
		  &done, Jacinv, &fdim, &(gradp[k*bas_n*dim]), &fdim,
		  &dzero, Jinvgrad, &fdim );


	  /* nk[el]=integral (grad(uh) - gradu_interpolated)^2 */
	  for (r=0; r<dim; r++)
	    for (cmp=0; cmp<ncomp; cmp++)
	      {
		double graduh, graduh_interpolated, diff_grads;

		graduh              = 0.0;
		graduh_interpolated = 0.0;

		for (i=0; i<bas_n; i++)
		  {
		    graduh
		      += uh->V[ldofs[cmp*bas_n+i]]*Jinvgrad[i*dim+r];

		    graduh_interpolated 
		      += (gradu_interp.V[r*ncomp*vx_nr+ldofs[cmp*bas_n+i]]
			  *phi[k*bas_n +i]);
		  }
		
		diff_grads = graduh - graduh_interpolated;

		/* printf("el=%3d k=%2d r=%d cmp=%d  graduh=%e  g_interp=%e\n", 
		   el, k, r, cmp, graduh, graduh_interpolated); */

		nk.V[el]+= diff_grads*diff_grads * iform.weights[k]*AdetJac;
	      }

	} /* end loop over integration points */
      /* printf("el=%3d   nk=%e\n", el, nk.V[el]); */
    } /* end loop over elements */

  /* Finally, decide which elements are to marked */      
  printf("gen_error_est_ZZ_tx->");
  err=gen_error_est_marker(m, &nk, marker, nrmarked, globest, set);
  FUNCTION_FAILURE_HANDLE( err, gen_error_est_marker, gen_error_est_ZZ_tx);

  vector_free(&nk);
  vector_free(&gradu_interp);
     

  free(ldofs);
  free(Jac);
  free(Jacinv);
  free(Jinvgrad);

  free_intdata (&iform);

    
  return SUCCESS;
}





/*FUNCTION*/
int gen_error_est_marker(struct mesh *m, struct vector *nk,
			 FIDX *marker, FIDX *nrmarked, double *globest,
			 struct solver_settings *set
/* marks the elements based on the element wise error estimates nk


   Input:  m           - the mesh
           nk          - the element wise error estimates
	   set	       - holding the settings for the process
           type        - defining which type of mesh this actually is
	                 type=1 ==> linear    triangles
			 type=2 ==> quadratic triangles

   Output: marker      - vector of length msh.el_nr with
                         marker[i]==1 -> element i isto be refined,
           nrmarked    - number of marked elements, i.e. number of
                         non-zero entries in marker (given by reference)
           globest     - value of the global estimate (given by reference)

   Return: SUCCESS     - success
           FAIL        - failure, see error message, output will not be
	                 valid
*/
			){
  FIDX i,j, el_nr;
  double nk_max;
  double n_global;

  el_nr = (*m).el_nr;

  if (nk->len!=el_nr)
    {
      fprintf(stderr,"gen_error_est_marker: element_nr does not match nk\n");
      return FAIL;
    }
  

  /* determine the largest nk*/
  nk_max=0.0;
  n_global=0.0;

  for (i=0; i<el_nr; i++) 
    {
      if (nk->V[i]>nk_max) nk_max=nk->V[i];
      n_global += nk->V[i];
    }  

  /* decide which elements are to be marked */
  if((*set).adap_mark==0)
    {  /* get maximum and mark all nk that are a certain 
	  percentage of it */
      double tol_nk; 	
      tol_nk = (*set).adap_mark_par*(*set).adap_mark_par*nk_max;
      j=0;
      for (i=0; i<el_nr; i++) 
        {
          if (nk->V[i]>=tol_nk)
            {      
	      marker[i]=1;
	      j++;
	    }
        }
      if (j==0)    
	{ /* Check if an element has been marked */
	  printf("gen_error_est_marker:"
		 " No elements have been marked! nk_max=%f, tol=%f\n",
		 nk_max, (*set).adap_mark_par);
	  return FAIL;
	}
    }
  else if ((*set).adap_mark==1)
    {  /* First sort the list, then mark the largest percentage */
      int *nk_pos;
      TRY_MALLOC( nk_pos, el_nr, int, gen_error_est_marker);
      for (i=0; i<el_nr; i++) nk_pos[i]=i;
      heapSort(nk->V, nk_pos, el_nr);
      j = (int) (((*set).adap_mark_par)*el_nr + 0.5);
      if (j < 1)
	{ /* Make sure at least one element is marked */
	  fprintf(stderr,"Warning in gen_error_est_marker:"
		  " adap_mark_par=%5.1e too small.\n Marker is set "
		  "to at least the element with the largest error.\n",
		  (*set).adap_mark_par );
	  j=1;
	}
      for (i=(el_nr-j); i<el_nr; i++) marker[nk_pos[i]]=1;
      free(nk_pos);
    }
  else if ((*set).adap_mark==2)
    {  /* First sort the list, then mark the largest until percentage
	  of global error is marked */
      int *nk_pos;
      double global_part, global_goal;
      TRY_MALLOC( nk_pos, el_nr, int, gen_error_est_marker);
      for (i=0; i<el_nr; i++) nk_pos[i]=i;
      heapSort(nk->V, nk_pos, el_nr);
      global_goal=(*set).adap_mark_par*n_global;
      global_part=0.0;
      j=0;
      for (i=(el_nr-1); ((i>=0)&&(global_part<global_goal)) ; i--)
	{
	  marker[nk_pos[i]]=1;
	  global_part += nk->V[i];
	  j++;
	}   
      free(nk_pos);
    }
  else
    {
      fprintf(stderr, "gen_error_est_marker: unknown marking type:"
	      " type=%d\n", (int) (*set).adap_mark);
      return FAIL;
    }
  
  printf("Marking:"
	 " %d elements marked (=%f). nk_max = %8.3e, n_global = %8.3e\n",
	 (int) j,((double) j)/((double) el_nr), sqrt(nk_max), sqrt(n_global)   ); 
  *nrmarked=j;
  *globest = sqrt(n_global);
  
  return SUCCESS;
}






/*FUNCTION*/
int gen_band_marker_tx(struct mesh *m, 
		       FIDX *marker, FIDX *nrmarked, 
		       double *c_band, double d_band, double width_band,
		       int type
/* marks the elements based that are in a band of the geometric domain
   as described below

   a point x is said to be in the band if
   abs(c^T x + d) <= width

   all elements which have at least one vertex in the band or which
   have points on both sides of the band will be marked

   Input:  m           - the mesh, must be either T1 or T2 elements
           nk          - the element wise error estimates
           c_band      - the vector c in the above definition of band,
                         must be of dimension m.dim
           d_band      - the scalar d in the above definition of band
           width_band  - the scalar width in the above definition of
                         band, width_band>0
	   type        - type of mesh, i.e. type==1 for T1, type==2
	                 for T2

   Output: marker      - vector of length msh.el_nr with
                         marker[i]==1 -> element i isto be refined,
           nrmarked    - number of marked elements, i.e. number of
                         non-zero entries in marker (given by reference)
           
   Return: SUCCESS     - success
           FAIL        - failure, see error message, output will not be
	                 valid
*/
			){
  FIDX i,j,k,dim, el_nr, el_w, vx_w, l_mctxELNOD1, l_mctxVXSTRT, nodes;

  switch(type)
    {
    case 1:
      l_mctxVXSTRT = MCT1VXSTRT;
      l_mctxELNOD1 = MCT1ELNOD1;
      nodes=3;
      break;
    case 2:
      l_mctxVXSTRT = MCT2VXSTRT;
      l_mctxELNOD1 = MCT2ELNOD1;
      nodes=3;
      break;
    default:
      fprintf(stderr,"gen_band_marker_tx: unknown type=%d\n",type);
      return FAIL;
    }


  el_nr = (*m).el_nr;
  el_w  = (*m).el_w;
  vx_w  = (*m).vx_w;
  dim   = (*m).dim;

  (*nrmarked)=0;
  for (i=0; i<el_nr; i++)
    {
      int is_in_band=0;
      int is_top_bot=0;

      for (j=0; j<nodes; j++)
	{
	  /* check where the j-th node of the element is, compute
	     c^Tx+d for it */
	  double cTxpd=d_band;
	  int this_top_bot;

	  for (k=0; k<dim; k++)
	    {
	      cTxpd += c_band[k]*
		(*m).vertex[(*m).elem[i*el_w+l_mctxELNOD1+j]*vx_w
			    + l_mctxVXSTRT+k];
	    }
	  
	  if (cTxpd < -width_band)
	    {  this_top_bot=-1; }
	  else if (cTxpd > +width_band)
	    {  this_top_bot=+1; }
	  else
	    {
	      this_top_bot=0;
	      is_in_band  =1;
	    }

	  if (is_top_bot==0)
	    {
	      is_top_bot=this_top_bot;
	    }
	  else if (is_top_bot*this_top_bot < 0)
	    {
	      /* there is one node top of the band and one bottom of
		 it, so mark this element */
	      is_in_band=1;
	    }

	} /* end loop j over nodes (j-th node) */
      
      marker[i]    = is_in_band;
      (*nrmarked) += is_in_band;
    } /* end loop i over elements */
  
  return SUCCESS;
}










/*FUNCTION*/
void heapSort(double *a, int *b, int len
/* Sorts a vector with the Heap Sort algorithm

   INPUT:        len   - lenght of array a and b
   
   INPUT/OUTPUT: a     - array whose elements are to be sorted 
   
   OUTPUT:       b     - array holding the new positions of the ordered elements

*/
){

  int start, end;

  /* start is assigned the index in a of the last parent node */
  start = (int) ((len - 2) / 2);
  while (start > -1) 
    {     
      /* sift down the node at index start to the proper place such 
	 that all nodes below the start index are in heap order */
      heapSort_siftDown(a, b, start, len-1);
      start--;
      /* after sifting down the root all nodes/elements are in heap order */
    }
	    
  end = len - 1;
  while (end > 0) 
    {
      /* swap the root(maximum value) of the heap with the last
	 element of the heap */
      heapSort_swap(a, b, end, 0);
      /* put the heap back in max-heap order */
      heapSort_siftDown(a, b, 0, end-1);
      /* decrease the size of the heap by one so that the previous 
	 max value will stay in its proper placement */
      end--;
    }
}
  
/*FUNCTION*/
void heapSort_siftDown(double *a, int *b, int start, int end
/* part of the Heap Sort algorithm

  INPUT/OUTPUT: a   - an unordered array a of length count
		b   - vector holding the new positions of the ordered elements

  INPUT:        end - represents the limit of how far down the heap
			to sift.
*/
){
  int child, root;
	
  root = start;
  
  while ( (root*2) < end )
    {	/* While the root has at least one child */
      child = root*2+1; /* root*2+1 points to the left child */
	    
      /* If the child has a sibling and the child's value is less than its sibling's... */
      if ( (child < end) && (a[child]<a[child+1]) )
        {
	  child++; /* ... then point to the right child instead */
	}
      if (a[root]<a[child]) /* out of max-heap order */
	{
	  heapSort_swap(a, b, root, child);
	  root = child; /* repeat to continue sifting down the child now */
	}
      else return;
    }

}
  
/*FUNCTION*/
void heapSort_swap(double *a, int *b, int pos1, int pos2
/* swaps the two entries pos1 and pos2 in the array a and b

  INPUT/OUTPUT : a     - an unordered array a of length count
                 b     - array holding the new positions 
		         of the ordered elements

  INPUT:         pos1,
                 pos2  -positions o be swapped
*/
){

  double tempa = a[pos1];
  int tempb = b[pos1];

  a[pos1] = a[pos2];
  b[pos1] = b[pos2];

  a[pos2] = tempa;
  b[pos2] = tempb;

}
