/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2008, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/
/*
FILE navsto_adj.c
HEADER navsto_adj.h

TO_HEADER:


// useful text macros
#include "feins_macros.h"
// data structures 
#include "sparse_struct.h"
#include "navsto_struct.h"

*/

#include <math.h>

#include "adolc.h"


/* function prototypes */
#include "navsto_adj.h"

/* prototypes of external functions */
#include "sparse.h"
#include "mesh.h"
#include "cubature.h"
#include "elements.h"
#include "lin_solver.h"
#include "navsto_aux.h"

/*FUNCTION*/
int navsto_adj_convert(struct navsto_matrix *K
/* converts the system matrix of the Navier-Stokes system into its 
   transpose, including all lower level matrices (for multi grid),
   such that it can be used for an adjoint solve,

   the converting is not reversible!
   
   K = K^T;

   In/Out: K       - linearised Navier-Stokes-problem stiffness
                     matrix, must be the Newton linearisation!

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		       ){
  FIDX i, j, d, row, mlvldof;
  int  err;
  
  FIDX len, dim, vx_nr;

  FIDX lvlmax, lmax, lvl;

  double *invdiag;

  struct sparse *oldF, *newF, *newFs;
  struct multilvl *ml;

  double one=1.0;

  dim    = (*K).dim;
  vx_nr  = (*K).vx_nr;
  ml     = (*K).mld;
  invdiag= (*K).mg->invdiag;
  lvlmax = (*K).lvlmax;
  lmax   = (*K).lvl;

  if ( (*ml).lmax!=lmax ) 
    {
      fprintf(stderr, "navsto_adj_convert: "
	      "K.lvl and K.ml.lmax mismatch!\n");
      return FAIL;
    }


  /* allocate space for the transposed matrices */
  TRY_MALLOC( newFs, lvlmax, struct sparse, navsto_adj_convert);
  for (i=0; i<lvlmax; i++)
    sparse_init( &newFs[i]);


  /* convert the advection diffusion part on all levels */
  for (lvl=lmax; lvl>=0; lvl--)
    {
      oldF=&(*K).Fs[lvl];
      newF=&newFs[lvl];

      len      = (*oldF).row_nr;

#ifndef NEW_ADJOINT
      /* incorporating dirichlet BC into the matrix is only necessary
	 for the old approach */

      /* implement the Dirichlet BC in the matrix */
      for (i=0; i<(*K).bn_nr; i++)
	{
	  for (j=0; j<dim; j++)
	    {
	      /* the node on the finest level */
	      row= j*vx_nr+(*K).nodes[i];
	      /* the node in the simple multilevel vector */
	      MLVLFINDENTRY( mlvldof, row, lvl, *ml);
	      if (mlvldof>=0)
		{
		  /* correct the 1/diag entry (since we are going to
		     change the diag entry of the matrix) */
		  invdiag[mlvldof]=1.0 /* ^-1 */;

		  /* the dof in the current level */
		  mlvldof-=(*ml).nlevl[lvl+1];
		  /* delete the line */
		  err=sparse_row_empty(oldF, mlvldof);
		  FUNCTION_FAILURE_HANDLE( err, sparse_row_empty,
					   navsto_adj_convert);
		  /* replace it by an identity */
		  err=sparse_add_local(oldF, NoTrans, 1, &mlvldof, 1,
				       &mlvldof, &one, 1);
		  FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
					   navsto_adj_convert);
		}
	    }
	}
#endif

      /* transpose it */
      err=sparse_mat_transp(oldF, newF);
      FUNCTION_FAILURE_HANDLE( err, sparse_mat_transp,
			       navsto_adj_convert);

      /* free the old matrix */
      sparse_free(oldF);
    } /* next level */


  /* free the old (now empty) matrix structs */
  free((*K).Fs);

  /* make the new transposed matrices available */
  (*K).Fs = newFs;

#ifndef NEW_ADJOINT
  /* old approach */
  /* reset the inner counter and step */
  for (i=0; i<5; i++)
    {
      (*K).innercount[i]=0;
      (*K).innersteps[i]=-1;
    }
#else
  /* for the new approach we can simply use the steps from the forward
     solve, only reset the counter then */

  /* reset the inner counter and step */
  for (i=0; i<5; i++)
    {
      (*K).innercount[i]=0;

    }
  //(*K).innersteps[0]=15;

  /* generate transposed coarse grid matrix inverse */
#ifndef MG_F_NO_CMAT
  {
    FIDX lvx_nr, ndiri, *diris;
    
    lvx_nr = (*K).Fs[0].row_nr / dim;

    TRY_MALLOC( diris, lvx_nr*dim, FIDX,
		navsto_adj_convert);

    coarse_mat_free( &(*K).cmat);

    ndiri=0;
    for (i=0; i<(*K).bn_nr; i++)
      {
	FIDX node;
	node=(*K).nodes[i];
	if (node<lvx_nr)
	  for (d=0; d<dim; d++)
	    {
	      diris[ndiri]=node+d*lvx_nr;
	      ndiri++;
	    }
      }

    /* set the coarse grid matrix */
    err=coarse_mat_set( &(*K).Fs[0], ndiri, diris, 2,
			&(*K).cmat); 
    FUNCTION_FAILURE_HANDLE( err, coarse_mat_set,
			     navsto_adj_convert);
    free(diris);
  }
#endif

  /* still only if new Adjoint implementation... */
  /* need to transpose the Cp part of the matrix struct */
  { 
    /* a little hack to copy Cp out of the navsto_matrix struct */
    struct sparse Cp_old;
       
    Cp_old.type     = (*K).Cp.type;
    Cp_old.row_nr   = (*K).Cp.row_nr;
    Cp_old.rows     = (*K).Cp.rows;
    Cp_old.cols     = (*K).Cp.cols;
    Cp_old.A        = (*K).Cp.A;
    Cp_old.flex_rows= (*K).Cp.flex_rows;
    
    (*K).Cp.type     = SP_TYPE_UNDEF;
    (*K).Cp.row_nr   = 0;
    (*K).Cp.rows     = NULL;
    (*K).Cp.cols     = NULL;
    (*K).Cp.A        = NULL;
    (*K).Cp.flex_rows= NULL;
    
    err=sparse_mat_transp( &Cp_old, &( (*K).Cp) );
    FUNCTION_FAILURE_HANDLE( err, sparse_mat_transp,
			     navsto_adj_convert);

    sparse_free( &Cp_old ); 

    //coarse_mat_free(&(*K).cmat);
  }
#endif

  /* nothing to do with the B parts, that is handled by the
     multiplication routines */
  return SUCCESS;
}


/*FUNCTION*/
int navsto_adjmat_tim_vec(void *arg1, struct vector *vec,
			  struct vector *out
/* multiplies the matrix K from left to the vector vec,
   
   out = K * vec;

   Input:  arg1=
           K       - linearised Navier-Stokes-problem stiffness matrix
           vec     - vector

   Output: out     - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  struct vector x, y;
  struct navsto_matrix *K;

  int  err;
  FIDX i, j;
  FIDX dim, lvl, bigN, vx_nr, pvx_nr, *pdof;

  double pn, *hvec;

  K      = (navsto_matrix*) arg1;
  dim    = (*K).dim;
  lvl    = (*K).lvl;
  vx_nr  = (*K).vx_nr;
  pvx_nr = (*K).pvx_nr;
  pdof   = (*K).pdof;
  hvec   = (*K).mg->xl;
  bigN   = dim*vx_nr+pvx_nr;

  /* for the velocity part of the output vector, first apply the
     Navier-Stokes part (Fs(^T)), then the pressure part (\hat{B}^T),
     thats B^T whilst p_N=0
  */
  pn= (*vec).V[bigN-1];
  (*vec).V[bigN-1]=0.0;

  /* the Fs(^T) part */
  x.len   = dim*vx_nr;
  x.n_max = dim*vx_nr;
  x.V     = &(*vec).V[0];
  
  y.len   = dim*vx_nr;
  y.n_max = dim*vx_nr;
  y.V     = &(*out).V[0];
  
  err=sparse_mul_mat_vec( &(*K).Fs[lvl], &x, &y);
  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec, 
			   navsto_adjmat_tim_vec);

  /* now B^T */
  for (i=0; i<dim; i++)
    {
      /* out_i += B_i^T * vec_p */
      x.len   = pvx_nr;
      x.n_max = pvx_nr;
      x.V     = &(*vec).V[dim*vx_nr];

      y.len   = vx_nr;
      y.n_max = vx_nr;
      y.V     = &(*out).V[i*vx_nr];

      err=sparse_mul_mat_vec_add_trans( &(*K).Bs[i], &x, &y);
      FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add_trans, 
			       navsto_adjmat_tim_vec);
    }

  /* reverse the modification of p_n */
  (*vec).V[bigN-1]=pn;


  /* out_p =  sum (B_i * Projector^T  * vec_i) */
  for (i=0; i<dim; i++)
    {
      for (j=0; j<vx_nr; j++)
	{
	  hvec[j]=(*vec).V[i*vx_nr+j];
	}
      /* the projector */
      for (j=0; j<(*K).bn_nr; j++)
	{
	  hvec[ (*K).nodes[j] ]   = 0.0 ;
	}

      x.len   = vx_nr;
      x.n_max = vx_nr;
      x.V     = hvec;

      y.len   = pvx_nr;
      y.n_max = pvx_nr;
      y.V     = &(*out).V[dim*vx_nr];

      /* B */
      if (i==0)
	{
	  err=sparse_mul_mat_vec( &(*K).Bs[0], &x, &y);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec, 
				   navsto_adjmat_tim_vec);
	}
      else
	{
	  err=sparse_mul_mat_vec_add( &(*K).Bs[i], &x, &y);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add, 
				   navsto_adjmat_tim_vec);
	}
    }

  /* add p_n * multiply weights to the pressure dofs */
  for (i=0; i<vx_nr; i++)
    {
      if (pdof[i]!=-1)
	{
	  (*out).V[dim*vx_nr + pdof[i] ] += pn * (*K).weight[i];;
	}
    }

  return SUCCESS;
}

/*FUNCTION*/
int navsto_adj_w_precon(void *arg1, struct vector *in,
			void *notused,
			struct vector *out
/* performs preconditioning for the adjoint equation of Navier-Stokes
   problems,  

     out = C^-1 * in

   where C^-1 is the Wathen block preconditioner:
        
   C^-1=[I                   ] [ I   ] [F^-1  ] velocity space
        [  (Mp^-1)*Fp*(Ap^-1)]*[-B  I]*[     I] pressure space

   where F is the advection diffusion operator, Mp is the pressure
   space mass matrix, Fp the pressure space advection diffusion
   operator, Ap the pressure space laplace operator
   (F, Fp and Ap are discretisations of these operators)

   Input:  arg1=
           K       - Navier-Stokes-matrix struct
           in      - input vector
	   notused - well, it is not used but in the interface to
                     allow this function to be used as a
                     preconditioner

   Output: out    - (given by reference), C^-1* in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i, j, dim, vx_nr, pvx_nr;
  int  err, iter;
  double resi, alpha;
  FIDX *pdof;
  struct vector rhs, xi;
  struct navsto_matrix *K;

  double *hvec;

  K      = (navsto_matrix*) arg1;
  vx_nr  = (*K).vx_nr;
  pvx_nr = (*K).pvx_nr;
  pdof   = (*K).pdof;
  hvec   = (*K).mg->xl;
  dim    = (*K).dim;

  if (((*in).len!=(*out).len)||((*in).len<dim*vx_nr+pvx_nr))
    {
      fprintf(stderr,
	      "navsto_adj_w_precon: dimensions make no sense!\n");
      return FAIL;
    }

  err=vector_alloc( &rhs, dim*vx_nr);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_adj_w_precon);

  /* velocity components */
  /* copy in to rhs */
  for (i=0; i<dim*vx_nr; i++)
    {
      rhs.V[i]=(*in).V[i];
    }

  /* prepare to solve F out = rhs */
  xi.len=dim*vx_nr;
  xi.n_max=dim*vx_nr;
  xi.V=&(*out).V[0];

  /* solve F out = rhs */
  if ((*K).innersteps[0]<0)
    {
      err=GMRES( 30, 500, 2, (*K).innereps, 0, &xi, &resi, &iter,
		 navsto_advecdiff_tim_vec, navsto_adj_vel_MG,
		 K, &rhs, NULL); /* */
      if (iter>1)
	{
	  (*K).innersteps[0]=iter;
	  printf("set adj_iter=%d\n", iter);
	}
      else
	{
	  printf("wait before setting adj_iter\n");
	}
    }
  else
    {
      err=GMRES( 30, (*K).innersteps[0], 3, 0.0, 0, &xi, &resi, &iter,
		 navsto_advecdiff_tim_vec, navsto_adj_vel_MG,
		 K, &rhs, NULL); /* */
    }
  FUNCTION_FAILURE_HANDLE( err, GMRES, navsto_adj_w_precon);
  (*K).innercount[0]+=iter;

  /* pressure components C^{-1} */ 
  /* copy in to rhs=-in.p */
  for (i=0; i<pvx_nr; i++)
    {
      rhs.V[i]=-(*in).V[dim*vx_nr+i];
    }
  rhs.len=pvx_nr;

  /* rhs=[B -I]*[out.vel; rsh] */
  for (j=0; j<dim; j++)
    {
      for (i=0; i<vx_nr; i++)
	hvec[i]=(*out).V[j*vx_nr+i];
      /* the projector */
      for (i=0; i<(*K).bn_nr; i++)
	{
	  hvec[ (*K).nodes[i] ]   = 0.0 ;
	}
      xi.len=vx_nr;
      xi.n_max=vx_nr;
      xi.V=hvec;
      err=sparse_mul_mat_vec_add( &(*K).Bs[j], &xi, &rhs);
      FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add,
			       stokes_projector_w_precon);
    }
  /* rhs=-rhs (= [-B P^T]*[out.vel; in.p]) */
  for (i=0; i<vx_nr; i++)
    {
      if ((*K).pdof[i]!=-1)
	{
	  rhs.V[(*K).pdof[i] ]= -rhs.V[(*K).pdof[i] ];
	}
    }

  /* the Wathen preconditioner does not account for the pressure
     component alpha associated with the pressure weights (=transpose
     of the mean pressure = 0 condition)
     so we do that by taking this component such that it minimizes the
     remaining right hand side
  */
  alpha=0;
  for (i=0; i<vx_nr; i++)
    {
      if ((*K).pdof[i]!=-1)
	{
	  alpha += rhs.V[(*K).pdof[i] ]* (*K).weight[i];
	}
    }
  /* adjust the right hand side */
  /*#warning "damped the projection in the adjoint preco"
    alpha *= 0.5; /* */
  for (i=0; i<vx_nr; i++)
    {
      if ((*K).pdof[i]!=-1)
	{
	  rhs.V[(*K).pdof[i] ] -= alpha * (*K).weight[i];
	}
    }

  xi.len=pvx_nr;
  xi.n_max=pvx_nr;
  xi.V=&(*out).V[dim*vx_nr];

  /* solve Mp out = rhs,  (Mp=Mp^T) */
  if ((*K).innersteps[dim+1] <0)
    {
      err=PCG( 10000, 2, (*K).innereps, 0, &xi, &resi, &iter,
	       sparse_mul_mat_vec, vector_diag_scale,
	       &(*K).M, &rhs,  &(*K).Minvdiag);
      (*K).innersteps[dim+1]=iter;
    }
  else
    {
      err=PCG( (*K).innersteps[dim+1], 0, 0.0,0, &xi, &resi, &iter,
	       sparse_mul_mat_vec, vector_diag_scale,
	       &(*K).M, &rhs,  &(*K).Minvdiag);
      if (err==10) err=SUCCESS; /* do fixed number of iterations,
				   remaining residual is ignored */
    }
  FUNCTION_FAILURE_HANDLE( err, PCG, navsto_adj_w_precon);
  (*K).innercount[dim+1]+=iter;

  /* multiply by Fp:
     rhs = Fp * out ( = (Ap + Cp^T)*out ) */
  err=sparse_mul_mat_vec(&(*K).Ap, &xi, &rhs);
  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec,
			   navsto_adj_w_precon);
  err=sparse_mul_mat_vec_add_trans(&(*K).Cp, &xi, &rhs);
  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add_trans,
			   navsto_adj_w_precon);

  /*************************** test *************************
    project the rhs for the Ap solve
   */
#warning "explain additional projection step in adjoint befroe Ap solve!"  
  {
    double sumAprhs=0;
    for (i=0; i<vx_nr; i++)
      {
	if ((*K).pdof[i]!=-1)
	  {
	    sumAprhs += rhs.V[(*K).pdof[i] ];
	  }
      }
    /* adjust the right hand side */
    for (i=0; i<vx_nr; i++)
      {
	if ((*K).pdof[i]!=-1)
	  {
	    rhs.V[(*K).pdof[i] ] -= sumAprhs * (*K).weight[i];
	  }
      }
  }


  /* solve Ap out = rhs,   (Ap=Ap^T) */
  if ((*K).innersteps[dim] <0)
    {
      err=PCG( 10000, 2, (*K).innereps, 0, &xi, &resi, &iter,
	       sparse_mul_mat_vec, navsto_precon_p_lapl_bpx,
	       &(*K).Ap, &rhs, K); /* */
      (*K).innersteps[dim]=iter;
    }
  else
    {
      err=PCG( (*K).innersteps[dim], 0, 0.0, 0, &xi, &resi, &iter,
	       sparse_mul_mat_vec, navsto_precon_p_lapl_bpx,
	       &(*K).Ap, &rhs, K); /* */
      if (err==10) err=SUCCESS; /* do fixed number of iterations,
				   remaining residual is ignored */
    }
  FUNCTION_FAILURE_HANDLE( err, PCG, navsto_adj_w_precon);
  (*K).innercount[dim]+=iter;


  /* the last pressure entry has a special role, which is not covered
     by the preconditioner */
  (*out).V[(*in).len-1]=alpha; 

  /* C^{-1} done */

  /* done */
  vector_free( &rhs );

  return SUCCESS;
}



/*FUNCTION*/
int navsto_adj_vel_MG(void *arg1, struct vector *in,
		      void *notused, struct vector *out
/* performs the boudary condition projection and multigrid
   preconditioning for the advection diffusion subproblems of the
   Wathen block preconditioner for navier stokes problems, 
     out = P*C^-1*P^T * in
   where P projects the velocity components of boundary nodes to zero,
   (such that addition of a projected vector doesn't change the
   velocity there), and C^-1 repressents the action of the multigrid
   preconditioning, that is one V cycle
   
   Input:  arg1=
           K       - navsto_matrix struct
           in      - input vector
	   notused - well, it is not used but in the interface to
                     allow this function to be used as a
                     preconditioner

   Output: out    - (given by reference), P*P^T* in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX d, i, j, len;
  int  err;

  struct navsto_matrix *K;

  FIDX dim, vx_nr, eh_w, eh_nr, eg_w, eg_nr;
  struct vector xi, bi, invdiagi, dxi;
  struct mesh *m;
  struct multilvl *ml;
  struct mgdata *mg;
  double *xl, *dxl, *bl, *invdiag;

  FIDX lvl, lmax, lmin, lvl_vx, smooths, vcycles, vccount;

  double drow;

  double normres, normb, stop_eps;

  K      = (navsto_matrix*) arg1;
  dim    = (*K).dim;
  vx_nr  = (*K).vx_nr;
  len    = dim * vx_nr;

  if ((len!=(*in).len)||(len!=(*out).len))
    {
      fprintf(stderr,
	      "navsto_adj_vel_MG: in or out length doesn't match\n");
      fprintf(stderr,
	      "should be: %d, in.len=%d, out.len=%d\n", (int) len,
	      (int) (*in).len, (int) (*out).len);
      fprintf(stderr, "dim=%d    vx_nr=%d\n", (int) dim, (int) vx_nr);
      return FAIL;
    }

  m  = (*K).msh;
  ml = (*K).mld;
  mg = (*K).mg;

  xl   = (*mg).xl;
  dxl  = (*mg).dxl;
  bl   = (*mg).bl;
  invdiag = (*mg).invdiag;

  smooths  = (*mg).smooths;
  vcycles  = (*mg).vcycles;
  stop_eps = (*mg).stop_eps;

  vx_nr= (*K).vx_nr;
  dim  = (*K).dim;
  eg_nr= (*m).eg_nr;
  eg_w = (*m).eg_w;
  eh_nr= (*m).eh_nr;
  eh_w = (*m).eh_w;

  lmax=(*ml).lmax;

  lmin=0; /*lmax-2;*/

  if (lmin<0) lmin=0;

  if (((*in).len!=(*out).len)||((*in).len!=dim*vx_nr))
    {
      fprintf(stderr,
	      "navsto_adj_vel_MG: dimensions make no sense!\n");
      return FAIL;
    }

  if ( (*K).lvl!=lmax ) 
    {
      fprintf(stderr, "navsto_adj_vel_MG: "
	      "K.lvl and K.ml.lmax mismatch!\n");
      return FAIL;
    }


  /* copy in to bl */
  for (i=0; i<(*in).len; i++)
    {
      bl[i]=(*in).V[i];
    }

  /* apply C^-1 */

  /* set the vectors for multilpication with the stiffness matrix */
  dxi.V     = &drow;
  dxi.n_max = 1;
  dxi.len   = 1;

  if (vcycles==0)
    {
      fprintf(stderr, "stokes_projector_part_MG: vcycles==0 not "
	      "defined\n");
      return FAIL;
    }
  if (stop_eps<0.0)
    {
      fprintf(stderr, "stokes_projector_part_MG: stop_eps<0 not "
	      "sensible\n");
      return FAIL;
    }
 
  if (vcycles<0)
    {
      /* compute normb */
      normb=0.0;
      for (i=0; i<(*in).len; i++)
	{
	  normb+=bl[i]*bl[i];
	}
      normb=sqrt(normb);
    }
  else
    {
      normb=1.0;
      stop_eps=1;
    }
  /* make sure the process starts */
  normres=2*stop_eps*normb;

  /* perform at most vcycles V-cycles */
  for (vccount=0; (vccount<abs(vcycles))&&(normres>stop_eps*normb);
       vccount++)
    {
      /* V-cycle downward */
      for (lvl=lmax; lvl>=lmin; lvl--)
	{ 
	  lvl_vx=(*ml).nlevl[lvl]-(*ml).nlevl[lvl+1];
	  
	  if ((vccount==0)||(lvl<lmax))
	    {
	      /* set xl=0 */
	      for (j=(*ml).nlevl[lvl+1]; j<(*ml).nlevl[lvl]; j++)
		xl[j]=0.0;
	    }

	  /* make x_l, b_l, invdiag_l accessible as vector */
	  xi.len=lvl_vx;
	  xi.n_max=lvl_vx;
	  xi.V=&xl[(*ml).nlevl[lvl+1]];

	  bi.len=lvl_vx;
	  bi.n_max=lvl_vx;
	  bi.V=&bl[(*ml).nlevl[lvl+1]];

	  invdiagi.len=lvl_vx;
	  invdiagi.n_max=lvl_vx;
	  invdiagi.V=&invdiag[(*ml).nlevl[lvl+1]];

	  if ((lvl==0)&&((*K).cmat.nr!=0))
	    {
	      /* use the coarse grid solver */
	      struct vector bcoarse;

	      /* apply the projector first */
	      for (i=0; i<(*K).bn_nr; i++)
		for (d=0; d<dim; d++)
		  {
		    FIDX node, nodel;
		    node=d*vx_nr+(*K).nodes[i];
		    MLVLFINDENTRY(nodel, node, lvl, *ml);
		    if (nodel>=0)
		      {
			bl[nodel]=0.0;
		      }
		  }

	      bcoarse.len=lvl_vx;
	      bcoarse.n_max=lvl_vx;
	      bcoarse.V=&bl[(*ml).nlevl[lvl+1]];
	      err=coarse_mat_solve( &(*K).cmat, Trans,
				    &bcoarse, &xi); 
	      FUNCTION_FAILURE_HANDLE( err, coarse_mat_solve,
				       navsto_projector_vel_MG);    
	    }
	  else
	    {
	      /* do smooths Gauss-Seidel sweeps forward */
	      err=sparse_GS_sweep_fwd( &(*K).Fs[lvl], &bi,
				       &invdiagi, smooths, &xi);
	      FUNCTION_FAILURE_HANDLE( err, sparse_GS_sweep_fwd,
				       navsto_projector_vel_MG);
	      
	      if (lvl==0)
		{
		  /* do smooths Gauss-Seidel sweeps backward */
		  err=sparse_GS_sweep_bwd( &(*K).Fs[lvl], &bi,
					   &invdiagi, smooths, &xi);
		  FUNCTION_FAILURE_HANDLE( err, sparse_GS_sweep_bwd,
					   navsto_projector_vel_MG);
		}
	    }

	  if (lvl>0)
	    {
	      /* compute the residual on this lvl */
	      /* compute the matrix vector product,
		 Cm*x */
	      dxi.V   = &dxl[(*ml).nlevl[lvl+1]];
	      dxi.len = lvl_vx;
	      dxi.n_max = lvl_vx;
	      err=sparse_mul_mat_vec( &(*K).Fs[lvl], &xi, &dxi);
	      FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec,
				       navsto_adj_vel_MG);

	      /* now change dxl to rl=bl-A*xl */
	      for (j=(*ml).nlevl[lvl+1]; j<(*ml).nlevl[lvl]; j++)
		dxl[j]=bl[j]-dxl[j];

	      /* restrict rl to r_l-1 */
	      err=mg_restrict_t2( mg, lvl, lvl-1, dxl);
	      FUNCTION_FAILURE_HANDLE( err, mg_restrict_2,
				       navsto_adj_vel_MG);
	      /* copy r_l-1 to b_l-1 */
	      for (j=(*ml).nlevl[lvl]; j<(*ml).nlevl[lvl-1]; j++)
		{
		  bl[j]=dxl[j];
		}

	      dxi.V     = &drow;
	      dxi.n_max = 1;
	      dxi.len   = 1;
	    }
	} /* end V-cycle downward */

      /* V-cycle upward */
      for (lvl=lmin+1; lvl<=lmax; lvl++)
	{ 
	  lvl_vx=(*ml).nlevl[lvl]-(*ml).nlevl[lvl+1];

	  /* apply the update computed in the lower level */
	  /* copy the update to dx */
	  for (j=(*ml).nlevl[lvl]; j<(*ml).nlevl[lvl-1]; j++)
	    dxl[j]=xl[j];
	  /* interpolate dx to lvl */
	  err=mg_interpolate_t2( mg, lvl-1, lvl, dxl);
	  FUNCTION_FAILURE_HANDLE( err, mg_interpolate_t2,
				   navsto_adj_vel_MG);

	  /* apply the update to xl */
	  for (j=(*ml).nlevl[lvl+1]; j<(*ml).nlevl[lvl]; j++)
	    xl[j]+=dxl[j];

	  /* make x_l, b_l, invdiag_l accessible as vector */
	  xi.len=lvl_vx;
	  xi.n_max=lvl_vx;
	  xi.V=&xl[(*ml).nlevl[lvl+1]];

	  bi.len=lvl_vx;
	  bi.n_max=lvl_vx;
	  bi.V=&bl[(*ml).nlevl[lvl+1]];

	  invdiagi.len=lvl_vx;
	  invdiagi.n_max=lvl_vx;
	  invdiagi.V=&invdiag[(*ml).nlevl[lvl+1]];
	  
	  /* do smooths Gauss-Seidel sweeps backward */
	  err=sparse_GS_sweep_bwd( &(*K).Fs[lvl], &bi,
				   &invdiagi, smooths, &xi);
	  FUNCTION_FAILURE_HANDLE( err, sparse_GS_sweep_bwd,
				   navsto_projector_vel_MG);
	} /* end V-cycle upward */

      if (vcycles<0)
	{
	  /* compute the residual norm */
	  /* out=A*x */
	  err=sparse_mul_mat_vec(&(*K).Fs[lmax], &xi, out);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec,
				   navsto_adj_vel_MG);
	  /* out -= rhs = in */
	  for (i=0; i<(*out).len; i++)
	    {
	      (*out).V[i]-=bl[i];
	    }
	  normres=0.0;
	  for (i=0; i<(*out).len; i++)
	    {
	      normres+=(*out).V[i]*(*out).V[i];
	    }
	  normres=sqrt(normres);
	  /* printf("vccount=%3d    normres=%+8.1e\n", vccount, normres); /* */
	}
    } /* end V-cycles loop */
  
  (*mg).vccount+=vccount;

  /* copy xl to out */
  for (i=0; i<(*out).len; i++)
    {
      (*out).V[i]=xl[i];
    }


  /* done */

  return SUCCESS;
#undef SMOTHS
}




/*FUNCTION*/
int navsto_Psi_dRdF_t21(struct vector *Psi_dRdF, 
			struct vector *Psi,
			struct vector *sol,
			struct navsto_matrix *K,
			struct mesh *msh
/* evaluates the Psi times the derivative of the residual with respect
   to the position, the residual is defined as the finite element
   discretisation (with P2-P1 triangles) of the Navier-Stokes equation

         -nu Laplace(u) + u * grad u + grad p = f
                                        div u = 0

   with boundary conditions as given in K,

   The evaluation of the derivatives via the discrete adjoint method 
   requires the evaluation of

     DI   dI         dR
     -- = -- - Psi^T -- .
     DF   dF         dF

   This routine provides the 

                     dR
               Psi^T --
                     dF

   part. (Note that the "-" is omitted!)

   In/Out: Psi     - the adjoint solution vector for the NS-system,
                     it is mostly read only, but the values at
                     dirichlet nodes are set to zero,
		     SO IT IS MODIFIED!
	   
   Input:  sol     - the solution vector of the NS-system (where the
                     system is to be linearized)
	   K       - the Navier-Stokes matrix struct, used to get the 
	             pressure dofs and boundary conditions
	   m       - the mesh, the shape defining node positions are
	             marked in there as well

   Output: Psi_dRdF- Psi times jacobian matrix of the residual with
                     respect to the node positions, such that
                     Psi_dRdF[d*vx_nr+k] presents derivative values
                     respect to the d-th component of the k-th node,
		     thus this vector has to be of length dim*vx_nr,


   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		    ){
  FIDX el, i, j, k, l, m, n, r, s1, t;
  int  err;

  FIDX dim=2, bas_n1, bas_n2, bas_n2_2, bas_n2_3, vx_nr, eg_nr, hi_nr,
    bd_nr, el_w, vx_w, eg_w, fc_w, bd_w, hi_w, fu_w,
    bigN;
  FIDX subtypes[2];

  struct int_data iform; /* integration formula 2d   */

  double *eldRdxv;       /* element residual derivative velocity comps */
  double *eldRdxp;       /* element residual derivative pressure comps */
  double *eldwdx;        /* element pressure weights derivative  */

  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  double detJac;         /* determinant of the Jacobian */
  double *ddet_dx;       /* derivative of |detJac| wrt node positions */
  double weight;         /* weight of the int. point times |detJac| */
  

  double *Jinvgrad1;     /* inverse Jacobian times gradphi1 */
  double *Jinvgrad2;     /* inverse Jacobian times gradphi2 */
  double *eldgphi_dx;    /* derivative of the real world gradient of
			    the quadratic basis wrt node positions */

  double elp;            /* pressure at the integration point */
  double *elu;           /* velocity at the integration point */
  double *elgu;          /* velocity gradient at the integration point */
  double *eldgu_dx;      /* derivative of the velocity gradient wrt
			    node positions */ 

  FIDX    *dofs1, *dofs2, *elpdofs;
                         /* degrees of freedom to which the local
			    matrices correspond */
  double nu;             /* nondimensional parameter describing the
			    flow, nu= 1/Re, where Re is the Reynolds
			    number, thus nu=mu/(rho*U*d), where mu is
			    the viscosity of the fluid, rho the
			    density, U is the velocity scale (the
			    solution velocities relate to this scale),
			    d is the lenght scale (the scale by which
			    lengths in the mesh are given),
			    nu is taken from m.param[MC2XPANUPO] */

  FIDX pvx_nr;           /* number of pressure dofs */
  FIDX *pdofs;           /* vector specifying the dof for a pressure
			    node */

  double Psi_n;          /* last row of Psi has special meaning,
			    therefore we will keep it outside Psi,
			    temporarily setting the value in Psi to
			    zero so it doesn't damage things in
			    Psi*dRdF */
  /* some helpers to work out the derivative of the last row */
  double *sumpdwdF;      /* sum of pressure times deriv of weight */
  double sumw;           /* sum of weights */
                        

  double *phi1, *phi2, *gradp1, *gradp2, *hessp2;

  /****************   init ******************************************/
  /* get integration formula */
  subtypes[0]=1;
  subtypes[1]=2;

  /* for t21 with linear isoparametric mappings only degree 5
     necessary, but not available yet ==> use degree 7 */
  err=cubature_bases( dim, 7, tria, 2, subtypes, &iform); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_Psi_dRdF_t21);

  /* make phi and gradphi better accessible */
  phi1   = (iform.bases[0]->phi);
  gradp1 = (iform.bases[0]->gradphi);
  bas_n1 = (iform.bases[0]->num_basis);
  phi2   = (iform.bases[1]->phi);
  gradp2 = (iform.bases[1]->gradphi);
  hessp2 = (iform.bases[1]->hessphi);
  bas_n2 = (iform.bases[1]->num_basis);

  vx_nr = (*msh).vx_nr;
  eg_nr = (*msh).eg_nr;
  hi_nr = (*msh).hi_nr;
  bd_nr = (*msh).bd_nr;
  vx_w  = (*msh).vx_w;
  el_w  = (*msh).el_w;
  eg_w  = (*msh).eg_w;
  fc_w  = (*msh).fc_w;
  bd_w  = (*msh).bd_w;
  hi_w  = (*msh).hi_w;
  fu_w  = (*msh).fu_w;

  nu    = (*msh).para[MC2XPANUPO];
  pdofs = (*K).pdof;
  pvx_nr= (*K).pvx_nr;
  bigN  = dim*vx_nr+pvx_nr;

  if (dim > 2)
    {
      /* cry */
      fprintf(stderr,
	      "navsto_Psi_dRdF_t21: dim >2 not implemented (Jacinv)\n");
      return FAIL;
    }

  if (bas_n1>bas_n2)
    {
      /* cry */
      fprintf(stderr,
	      "navsto_Psi_dRdF_t21: bas_n1>bas_n2 ???????\n");
      return FAIL;
    }

  if ( Psi_dRdF->len != dim*vx_nr )
    {
      fprintf(stderr,
	      "navsto_Psi_dRdF_t21: Psi_dRdF has wrong size\n");
      return FAIL;
    }
  if ( Psi->len != bigN )
    {
      fprintf(stderr,
	      "navsto_Psi_dRdF_t21: Psi has wrong size\n");
      return FAIL;
    }
  if ( sol->len != bigN )
    {
      fprintf(stderr,
	      "navsto_Psi_dRdF_t21: sol has wrong size\n");
      return FAIL;
    }


  /* further init */
  bas_n2_2=bas_n2*bas_n2;    /* bas_n2^2 */
  bas_n2_3=bas_n2*bas_n2_2;  /* bas_n2^3 */

  /* allocate memory for the matrices/vectors on the element */
  TRY_MALLOC( Jac, dim*dim, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( Jacinv, dim*dim, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( ddet_dx, dim*bas_n1, double, navsto_Psi_dRdF_t21);

  TRY_MALLOC( Jinvgrad1, dim*bas_n1, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( Jinvgrad2, dim*bas_n2, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( eldgphi_dx, bas_n2*dim*dim*bas_n1, double,
	      navsto_Psi_dRdF_t21);

  TRY_MALLOC( elu, dim, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( elgu, dim*bas_n2, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( eldgu_dx, dim*dim*dim*bas_n1, double, navsto_Psi_dRdF_t21);

  TRY_MALLOC( eldRdxv, dim*bas_n2*dim*bas_n1, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( eldRdxp, bas_n1*dim*bas_n1, double, navsto_Psi_dRdF_t21);
  TRY_MALLOC( eldwdx, dim*bas_n1, double, navsto_Psi_dRdF_t21);

  TRY_MALLOC( dofs1, dim*bas_n1, FIDX, navsto_Psi_dRdF_t21);
  TRY_MALLOC( dofs2, dim*bas_n2, FIDX, navsto_Psi_dRdF_t21);
  TRY_MALLOC( elpdofs, bas_n1, FIDX, navsto_Psi_dRdF_t21);

  TRY_MALLOC( sumpdwdF, dim*vx_nr, double, navsto_Psi_dRdF_t21);

  /* set Psi_dRdF, sumpdwdF to zero */
  for (i=0; i<dim*vx_nr; i++)
    {
      Psi_dRdF->V[i]=0.0;
      sumpdwdF[i]=0.0;
    }

  /* intialise sums */
  sumw=0.0;


#ifndef NEW_ADJOINT
  /* stuff that is only required by the old approach */

  /* the velocities at  the dirichlet nodes are set with an identity
     matrix, independent on the node positions, thus the corresponding
     rows of dRdF are zero, we implement this by setting Psi zero for
     those rows, avoiding case distinctions in the assembly part */
  for (i=0; i<(*K).bn_nr; i++)
    {
      for (j=0; j<dim; j++)
	{
	  Psi->V[ (*K).nodes[i]+j*vx_nr ]=0.0;
	}
    }
  /* keep last row of Psi seperately and overwrite it in Psi with zero
     to avoid collisions in the multiplication Psi*dRdF */
  Psi_n          = Psi->V[bigN-1];
  Psi->V[bigN-1] = 0.0;

  printf("\ntest: Psi_Lambda=%e\n\n", Psi_n);
#else
  /* just to avoid trouble */
  Psi_n          = 0.0; 
#endif

  /* loop over all elements */
  for (el=0; el<(*msh).el_nr; el++)
    {
      /* define the loval dofs */
      for (i=0; i<dim; i++)
	for (j=0; j<bas_n1; j++)
	  dofs1[i*bas_n1+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
      for (i=0; i<dim; i++)
	for (j=0; j<bas_n2; j++)
	  dofs2[i*bas_n2+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
       for (i=0; i<bas_n1; i++)
	 elpdofs[i]=dim*vx_nr+pdofs[dofs1[i]];

      /* set eldRdxv to zero */
      for (i=0; i<dim*bas_n2*dim*bas_n1; i++) 
	{
	  eldRdxv[i]=0.0;
	}
      /* set eldRdxp to zero */
      for (i=0; i<bas_n1*dim*bas_n1; i++) 
	{
	  eldRdxp[i]=0.0;
	}

      /* set eldRndx, eldsumwdx to zero */
      for (i=0; i<dim*bas_n1; i++) 
	{
	  eldwdx[i]=0.0;
	}

      /* compute the Jacobian of the linear isoparmetric element
	 mapping */
      /* Jac=0 */
      for (i=0; i<dim*dim; i++)
	Jac[i]=0.0;
	  
      /* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
      for (i=0;i<bas_n1; i++)
	{
	  for (j=0; j<dim; j++)
	    {
	      for (r=0; r<dim; r++)
		{
		  Jac[j*dim+r]+= 
		    (*msh).vertex[dofs2[0+i]*vx_w+MCT2VXSTRT+j]
		    * gradp1[0*bas_n1*dim +i*dim +r];
		}
	    }
	}

      /* get detJac */
      detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
      /* get Jacinv (here direct) */
      Jacinv[0]=1.0/detJac*Jac[3];
      Jacinv[1]=-1.0/detJac*Jac[1];
      Jacinv[2]=-1.0/detJac*Jac[2];
      Jacinv[3]=1.0/detJac*Jac[0];

      /* ddet_dx= derivative of the |detJac| wrt node positions */
      for (i=0; i<bas_n1*dim; i++)
	ddet_dx[i]=0;
      for (l=0; l<bas_n1; l++)
	for (t=0; t<dim; t++)
	  for (s1=0; s1<dim; s1++)
	    ddet_dx[l*dim+t]+= Jacinv[s1*dim+t]
	      *gradp1[0*bas_n1*dim+l*dim+s1];
      for (i=0; i<bas_n1*dim; i++)
	ddet_dx[i]*=fabs(detJac);


      /* Jinvgrad1= Jacinv * gradphi1[k,:,:]
	 (=real world gradient T1 = const on each element )
      */
      for (i=0; i<dim; i++)
	{
	  for (j=0; j<bas_n1; j++)
	    {
	      Jinvgrad1[j*dim+i]=0.0;
	      for (l=0; l<dim; l++)
		{
		  Jinvgrad1[j*dim+i]+= Jacinv[l*dim+i]
		    * gradp1[0*bas_n1*dim+j*dim+l];
		}
	    }
	}

      /* loop over all integration points */
      for (k=0; k<iform.num_points; k++)
	{
	  /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
	     (=real world gradient T2 = linear function on each element)
	  */
	  for (i=0; i<dim; i++)
	    {
	      for (j=0; j<bas_n2; j++)
		{
		  Jinvgrad2[j*dim+i]=0.0;
		  for (l=0; l<dim; l++)
		    {
		      Jinvgrad2[j*dim+i]+= Jacinv[l*dim+i]
			* gradp2[k*bas_n2*dim+j*dim+l];
		    }
		}
	    }


	  /* derivative of the real world gradient wrt the node positions */
	  for (n=0; n<bas_n2; n++)
	    for (m=0; m<dim; m++)
	      for (t=0; t<dim; t++)
		for (l=0; l<bas_n1; l++)
		  {
		    eldgphi_dx[n*dim*dim*bas_n1+m*dim*bas_n1+t*bas_n1+l]
		      = - Jinvgrad1[l*dim+m]*Jinvgrad2[n*dim+t];
		  }

	  /* pressure at the integration point: */
	  elp=0.0;
	  for(s1=0; s1<bas_n1; s1++)
	    elp+=(*sol).V[elpdofs[s1]]
	      * phi1[k*bas_n1+s1];


	  /* velocity at the integration point: */
	  for (m=0; m<dim; m++)
	    {
	      elu[m]=0.0;
	      for(s1=0; s1<bas_n2; s1++)
		elu[m]+=(*sol).V[dofs2[m*bas_n2+s1]]
		  * phi2[k*bas_n2+s1];
	    }


	  /* velocity gradient at the integration point: */
	  for (i=0; i<dim; i++)
	    for (m=0; m<dim; m++)
	      {
		elgu[i*dim+m]=0.0;
		for(s1=0; s1<bas_n2; s1++)
		  elgu[i*dim+m]+= Jinvgrad2[s1*dim+m]
		    *(*sol).V[dofs2[i*bas_n2+s1]];
	      }

	  /* derivative of the velocity gradient wrt node positions */
	  for (i=0; i<dim; i++)
	    for (m=0; m<dim; m++)
	      for (t=0; t<dim; t++)
		for (l=0; l<bas_n1; l++)
		  {
		    eldgu_dx[i*dim*dim*bas_n1+m*dim*bas_n1+t*bas_n1+l]
		      = 0.0;
		    for (s1=0; s1<bas_n2; s1++)
		      {
			eldgu_dx[i*dim*dim*bas_n1+m*dim*bas_n1+t*bas_n1+l]
			  += eldgphi_dx[s1*dim*dim*bas_n1+m*dim*bas_n1
				     +t*bas_n1+l]
			  *(*sol).V[dofs2[i*bas_n2+s1]];
		      }
		  }


	  /* now all the incredients are ready, compute the
	     derivatives */

	  /* velocity components */
	  for (i=0; i<dim; i++)
	    for (j=0; j<bas_n2; j++)
	      for (t=0; t<dim; t++)
		for (l=0; l<bas_n1; l++)
		  {
		    /* the a(u,v[i,j]) (viscous) part */
		    /* nu*weight of this integration point */
		    weight=nu*iform.weights[k];
		    for (s1=0; s1<dim; s1++)
		      {
			eldRdxv[i*bas_n2*dim*bas_n1
				+j*dim*bas_n1+t*bas_n1+l] +=
			  weight * fabs(detJac)
			  *( eldgu_dx[i*dim*dim*bas_n1
				      +s1*dim*bas_n1+t*bas_n1+l]
			     * Jinvgrad2[j*dim+s1]
			    +elgu[i*dim+s1]
			     * eldgphi_dx[j*dim*dim*bas_n1+s1*dim*bas_n1
					  +t*bas_n1+l] )
			  + weight
			  *elgu[i*dim+s1]*Jinvgrad2[j*dim+s1]
			  *ddet_dx[l*dim+t];
		      }

		    /* the c(u,u,v[i,j]) (trilinear) part */
		    weight=iform.weights[k];
		    for (s1=0; s1<dim; s1++)
		      {
			eldRdxv[i*bas_n2*dim*bas_n1
				+j*dim*bas_n1+t*bas_n1+l] +=
			  weight * phi2[k*bas_n2+j] * elu[s1] *
			  ( eldgu_dx[i*dim*dim*bas_n1
				     +s1*dim*bas_n1+t*bas_n1+l]
			    * fabs(detJac)
			    + elgu[i*dim+s1]*ddet_dx[l*dim+t] );
		      }

		    /* the b(v[i,j],p) (pressure) part */
		    /* pressure*weight of this integration point */
		    weight=elp*iform.weights[k];
		    eldRdxv[i*bas_n2*dim*bas_n1
			    +j*dim*bas_n1+t*bas_n1+l]
		      -= weight
		      *( eldgphi_dx[j*dim*dim*bas_n1+i*dim*bas_n1
				    +t*bas_n1+l]
			 * fabs(detJac)
			 + Jinvgrad2[j*dim+i]*ddet_dx[l*dim+t]);
		  }

	  /* pressure components */
	  for (j=0; j<bas_n1; j++)
	    for (t=0; t<dim; t++)
	      for (l=0; l<bas_n1; l++)
		{
		  /* the b(u,q[j]) (divergence) part */
		  /* weight*q[j] of this integration point */
		  weight=phi1[k*bas_n1+j]*iform.weights[k];
		  for (s1=0; s1<dim; s1++)
		    {
		      eldRdxp[j*dim*bas_n1+t*bas_n1+l]
			-= weight *
			( eldgu_dx[s1*dim*dim*bas_n1
				   +s1*dim*bas_n1+t*bas_n1+l]
			  * fabs(detJac)
			  + elgu[s1*dim+s1]
			  * ddet_dx[l*dim+t] );
		    }
		}

	  /* last row of the residual */
	  weight=iform.weights[k];
	  for (t=0; t<dim; t++)
	    for (l=0; l<bas_n1; l++)
	      {
		eldwdx[t*bas_n1+l]
		  += weight * ddet_dx[l*dim+t];
	      }
	  for (j=0; j<bas_n1; j++)
	    {
	      sumw+=weight * fabs(detJac);
	    }

#warning navsto_Psi_dRdF_t21: no rhs assembly yet!
	} /* end loop over all integration points */
      
      /***************************************************************
       * the element wise derivatives are ready, now compute the     *
       * local contributions to Psi_dRdF                             *
       ***************************************************************/

      /* velocity-space residual */
      for (r=0; r<dim; r++) /* r-th dimension vel resisual */ 
	for (j=0; j<bas_n2; j++) /* j-th row of the vel res of this d */
	  for (t=0; t<dim; t++)  /* t-th dimension of the node deriv */
	    for (k=0; k<bas_n1; k++) /* k-th node deriv */
	      {
		Psi_dRdF->V[t*vx_nr+dofs1[k]]+=
		  Psi->V[r*vx_nr+dofs2[j]]
		  * eldRdxv[r*bas_n2*dim*bas_n1 + j*dim*bas_n1
			    + t*bas_n1 + k];
	      }

      /* pressure-space residual */
      for (j=0; j<bas_n1; j++) /* j-th row of the p-res */
	for (t=0; t<dim; t++)  /* t-th dimension of the node deriv */
	  for (k=0; k<bas_n1; k++) /* k-th node deriv */
	      {
		Psi_dRdF->V[t*vx_nr+dofs1[k]]+=
		  Psi->V[elpdofs[j]]
		  * eldRdxp[j*dim*bas_n1 + t*bas_n1 + k];
	      }

      
      /* stuff for the derivative of the last row */
      for (t=0; t<dim; t++)
	for (i=0; i<bas_n1; i++)
	  {
	    sumpdwdF[t*vx_nr+dofs1[i]]+=
	      sol->V[elpdofs[i]]*eldwdx[t*bas_n1+i];
	  }

    } /* end loop over all elements */

#ifndef NEW_ADJOINT
  /* stuff that is only required by the old approach */
  /* the derivative of the last row itself */
  weight = Psi_n/sumw;
  for (i=0; i<dim*vx_nr; i++)
    {
      Psi_dRdF->V[i] += weight*sumpdwdF[i];
    }
  

  /* can not easily undo changes to Psi at the dirichlet nodes, but
     can easily undo changes to last row */
  Psi->V[bigN-1] = Psi_n;
#endif
  
  /* free local data */
  free(sumpdwdF);

  free(elpdofs);
  free(dofs2);
  free(dofs1);

  free(eldwdx);
  free(eldRdxp);
  free(eldRdxv);

  free(eldgu_dx);
  free(elgu);
  free(elu);

  free(eldgphi_dx);
  free(Jinvgrad2);
  free(Jinvgrad1);
  
  free(ddet_dx);
  free(Jacinv);
  free(Jac);

  free_intdata (&iform);

  return SUCCESS;
}





/*FUNCTION*/
int navsto_dIdX_t21( struct mesh *msh, struct navsto_matrix *K,
		     struct vector *sol,
		     FIDX pc_nr, struct vector *pcvec,
		     struct vector *dIdsol, 
		     struct vector *dIdF,
		     struct vector *ref_sol
/* evaluates the performance criteria (given with the mesh) for the
   supplied solution vector sol, stores the values in pcvec

   Input:  msh       - the mesh
           K         - navsto_matrix struct, the pdof part is needed
           sol       - vector containing the solution of a Stokes or
                       Navier-Stokes problem, calculated using the
                       T2/T1 element pair
	   pc_nr     - number of performance criteria, has to be the
	               same as in the mesh file, pcvec, dIdsol and
	               dIdF have to provide space for at least pc_nr
	               criteria 
           ref_sol   - vector containing the reference solution, must
                       be compatible with sol (see above, same mesh)
		       if not required, ref_sol=NULL may me specified

   Output: pcvec     - vector containing the values of the performance
	               criteria, has to be allocated by the
	               calling routine, length >= pc_nr
	   dIdsol    - derivatives of the performance with respect to
	               the solution vector, array of px_nr vectors of
	               length bigN (bigN=sol.len), dIdsol[i].V[j] is
	               the derivative of the i-th component of the
	               performance vector wrt the j-th component of
	               the solution vector, not used if ==NULL
           dIdF      - derivatives of the performance with respect to
	               the node positions, array of pc_nr vectors of
	               length dim*vx_nr, dIdF[i].V[d*vx_nr+j] is the
	               derivative of the i-th component of the
	               performance vector wrt the d-th coordinate of
	               the j-th node, not used if ==NULL

   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
			 ){
  FIDX pcsu, pcvo, el, i, j, k, l, m, n, r, s1, s2, t;
  int  err;

  FIDX dim=2, bas_n1, bas_n2, bas_n1d1, bas_n2d1,
    vx_nr, eg_nr, bd_nr, hi_nr, 
    el_w, vx_w, eg_w, fc_w, bd_w, ps_w, pv_w, pc_w, hi_w, fu_w,
    bigN;
  FIDX subtypes[2];

  double nu;             /* nondimensional parameter describing the
			    flow, nu= 1/Re, where Re is the Reynolds
			    number, thus nu=mu/(rho*U*d), where mu is
			    the viscosity of the fluid, rho the
			    density, U is the velocity scale (the
			    solution velocities relate to this scale),
			    d is the lenght scale (the scale by which
			    lengths in the mesh are given),
			    nu is taken from m.param[MC2XPANUPO] */

  struct int_data iform; /* integration formula 2d   */
  struct int_data iform1;/* integration formula 1d   */

  FIDX    *eldofs1, *eldofs2, *elpdofs;
  FIDX    *egdofs1, *egdofs2, *egpdofs;
                         /* degrees of freedom to which the local
			    matrices correspond */ 

  double *eldIdF;        /* local contribution to dIdF */
  double *eldIdu;        /* local contribution to dIdu */
  double *eldIdp;        /* local contribution to dIdp */

  /* at the current integration point k: */
  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  double detJac;         /* determinant of the Jacobian */
  double *ddet_dx;       /* derivative of |detJac| wrt node positions */
  double weight;         /* weight of the int. point times |detJac| */

  double *Jinvgrad1;     /* inverse Jacobian times gradphi1 */
  double *Jinvgrad2;     /* inverse Jacobian times gradphi2 */
  double *eldgphi_dx;    /* derivative of the real world gradient of
			    the quadratic basis wrt node positions */

  double elp;            /* pressure at the integration point */
  double *elu;           /* velocity at the integration point */
  double *elgu;          /* velocity gradient at the integration point */
  double *eldgu_dx;      /* derivative of the velocity gradient wrt
			    node positions */ 
  double el_ref_p;       /* reference pressure at the integration point */
  double *el_ref_u;      /* reference velocity at the integration point */

  double *elt;           /* tangential vector at the integration point */
  double *elnds;         /* (outw normal)*ds at the integration point */
  double *eldnds_dx;     /* d(n*ds)/dx at the integration point */

  struct vector hv1, hv2;/* help vectors used during evaluation of
			    some criteria, they will be allocated to
			    the same size as pcvec and hold temporary
			    values */
  
  FIDX pvx_nr;           /* number of pressure dofs */

  /* pointer to the integration data */
  double *phi1, *phi2, *gradp1, *gradp2,
    *phi1d1, *gradp1d1, *phi2d1, *gradp2d1,
    *phi1line, *gradp1line, *hessp1line,
    *phi2line, *gradp2line, *hessp2line, *pointsline; 

  FIDX eg, pccrit, pctype, needgrad, edge_elem, orie;
  double edge_elw, *pcdat;

  int tloop, max_vol_loops;


  /****************   init ******************************************/
  /* get integration formula */
  subtypes[0]=1;
  subtypes[1]=2;

  err=cubature_bases( dim, 4, tria, 2, subtypes, &iform); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_dIdX_t21);

  err=cubature_bases( dim-1, 3, inter, 2, subtypes, &iform1); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_dIdX_t21);

  /* make phi and gradphi better accessible */
  phi1   = (iform.bases[0]->phi);
  gradp1 = (iform.bases[0]->gradphi);
  bas_n1 = (iform.bases[0]->num_basis);
  phi2   = (iform.bases[1]->phi);
  gradp2 = (iform.bases[1]->gradphi);
  bas_n2 = (iform.bases[1]->num_basis);
  phi1d1   = (iform1.bases[0]->phi);
  gradp1d1 = (iform1.bases[0]->gradphi);
  bas_n1d1 = (iform1.bases[0]->num_basis);
  phi2d1   = (iform1.bases[1]->phi);
  gradp2d1 = (iform1.bases[1]->gradphi);
  bas_n2d1 = (iform1.bases[1]->num_basis);

  /* We have to generate a integration formula for the gradients on
     lines as well, as this is not covered by those above. Luckily
     this is quite simple, as we can change the 1d points from iform1
     to 2d points and get phi2line and gradp2line. The weights stay
     the same. */
  TRY_MALLOC( pointsline, iform1.num_points*dim, double,
	      navsto_dIdX_t21);
  for (i=0; i<iform1.num_points; i++)
    {
      pointsline[i*dim+0]=iform1.points[i];
      pointsline[i*dim+1]=0.0;
    }
  err=eval_basis( dim, tria, 2, iform1.num_points, pointsline, 
		  &i, &phi2line, &gradp2line, &hessp2line);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_dIdX_t21);
  err=eval_basis( dim, tria, 1, iform1.num_points, pointsline, 
		  &i, &phi1line, &gradp1line, &hessp1line);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_dIdX_t21);
  /* now this defines the desired formula */

  vx_nr = (*msh).vx_nr;
  eg_nr = (*msh).eg_nr;
  bd_nr = (*msh).bd_nr;
  hi_nr = (*msh).hi_nr;
  vx_w  = (*msh).vx_w;
  el_w  = (*msh).el_w;
  eg_w  = (*msh).eg_w;
  fc_w  = (*msh).fc_w;
  bd_w  = (*msh).bd_w;
  ps_w  = (*msh).ps_w;
  pv_w  = (*msh).pv_w;
  pc_w  = (*msh).pc_w;
  hi_w  = (*msh).hi_w;
  fu_w  = (*msh).fu_w;

  nu    = (*msh).para[MC2XPANUPO];
  pvx_nr= (*K).pvx_nr;
  bigN  = dim*vx_nr+pvx_nr;

  if (dim > 2)
    {
      /* cry */
      fprintf(stderr,
	      "navsto_dIdX_t21: dim >2 not implemented "
	      "(Jacinv, lineint)\n");
      return FAIL;
    }

  if ((ref_sol!=NULL)&&( (*ref_sol).len!=bigN))
    {
      /* cry */
      fprintf(stderr,
	      "navsto_dIdX_t21: reference solution ref_sol does not fit to mesh"
	      "(size)\n");
      return FAIL;
    }


  if ((bas_n2<3)||(bas_n2d1!=3))
    {
      /* not supported, see definition of dofs2 in surface integral */
      fprintf(stderr,
	      "navsto_dIdX_t21: bas_n2<3 or bas_n2d1!=3 ???????\n");
      return FAIL;
    }
 

  if ((pc_nr!=(*msh).pc_nr)||(pcvec->len != pc_nr))
    {
      /* cry */
      fprintf(stderr,
	      "navsto_dIdX_t21: pc_nr dosn't match the mesh or"
	      " pcvec has wrong length\n");
      return FAIL;
    }

  if (dIdF!=NULL)
    for (i=0; i<pc_nr; i++)
      if (dIdF[i].len != dim*vx_nr)
	{
	  /* cry */
	  fprintf(stderr, "navsto_dIdX_t21: "
		  "dimension of dIdF[.] !=dim*vx_nr\n");
	  return FAIL;
	}
  if (dIdsol!=NULL)
    for (i=0; i<pc_nr; i++)
      if (dIdsol[i].len != bigN)
	{
	  /* cry */
	  fprintf(stderr, "navsto_dIdX_t21: "
		  "dimension of dIdsol[.] != bigN\n");
	  return FAIL;
	}

  /* allocate memory for the matrices/vectors on the element */
  TRY_MALLOC( Jac, dim*dim, double, navsto_dIdX_t21);
  TRY_MALLOC( Jacinv, dim*dim, double, navsto_dIdX_t21);
  TRY_MALLOC( ddet_dx, dim*bas_n1, double, navsto_dIdX_t21);

  TRY_MALLOC( Jinvgrad1, dim*bas_n1, double, navsto_dIdX_t21);
  TRY_MALLOC( Jinvgrad2, dim*bas_n2, double, navsto_dIdX_t21);
  TRY_MALLOC( eldgphi_dx, bas_n2*dim*dim*bas_n1, double, navsto_dIdX_t21);

  TRY_MALLOC( elu, dim, double, navsto_dIdX_t21);
  TRY_MALLOC( elgu, dim*bas_n2, double, navsto_dIdX_t21);
  TRY_MALLOC( eldgu_dx, dim*dim*dim*bas_n1, double, navsto_dIdX_t21);

  TRY_MALLOC( elt, dim, double, navsto_dIdX_t21);
  TRY_MALLOC( elnds, dim, double, navsto_dIdX_t21);
  TRY_MALLOC( eldnds_dx, dim*dim*bas_n1, double, navsto_dIdX_t21);

  TRY_MALLOC( eldIdF, dim*bas_n1, double, navsto_dIdX_t21);
  TRY_MALLOC( eldIdu, dim*bas_n2, double, navsto_dIdX_t21);
  TRY_MALLOC( eldIdp, bas_n1, double, navsto_dIdX_t21);

  TRY_MALLOC( egdofs1, dim*bas_n1, FIDX, navsto_dIdX_t21);
  TRY_MALLOC( egdofs2, dim*bas_n2, FIDX, navsto_dIdX_t21);
  TRY_MALLOC( egpdofs, bas_n1, FIDX, navsto_dIdX_t21);

  TRY_MALLOC( eldofs1, dim*bas_n1, FIDX, navsto_dIdX_t21);
  TRY_MALLOC( eldofs2, dim*bas_n2, FIDX, navsto_dIdX_t21);
  TRY_MALLOC( elpdofs, bas_n1, FIDX, navsto_dIdX_t21);

  if (ref_sol!=NULL)
    {
      TRY_MALLOC( el_ref_u, dim, double, navsto_dIdX_t21);
    }
  else 
    {
      el_ref_u=NULL;
    }

  err=vector_alloc(&hv1, pc_nr);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_dIdX_t21);
  err=vector_alloc(&hv2, pc_nr);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_dIdX_t21);

  /* clear the help vectors */
  for (i=0; i<pc_nr; i++) 
    {
      hv1.V[i]  = 0.0;
      hv2.V[i]  = 0.0;
    }

  /* clear the output arrays */
  for (i=0; i<pc_nr; i++) pcvec->V[i]  = 0.0;
  if (dIdF!=NULL)
    for (i=0; i<pc_nr; i++)
      for (j=0; j<dim*vx_nr; j++)
	dIdF[i].V[j] = 0.0;
  if (dIdsol!=NULL)
    for (i=0; i<pc_nr; i++)
      for (j=0; j<bigN; j++)
	dIdsol[i].V[j] = 0.0;


  /* loop over all pcsurf entries of the finest level*/
  pcsu=0;
  /* look for first entry of the finest level (the first with no childs) */
  while ((pcsu<(*msh).ps_nr)&&
	 ((*msh).edge[(*msh).pcsurf[pcsu*ps_w+MCXXPSSURF]*eg_w+MCT2EGCHL1]
	  !=-1)) pcsu++;
  while (pcsu<(*msh).ps_nr)
    {
      eg=(*msh).pcsurf[pcsu*ps_w+MCXXPSSURF];

      egdofs2[0       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1  ];
      egdofs2[1       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1+1];
      egdofs2[2       ] = (*msh).edge[eg*eg_w + MCT2EGNODM];
      for (i=1; i<dim; i++)
	{
	  egdofs2[i*bas_n2+0] = i*vx_nr + egdofs2[0];
	  egdofs2[i*bas_n2+1] = i*vx_nr + egdofs2[1];
	  egdofs2[i*bas_n2+2] = i*vx_nr + egdofs2[2];
	}

      egdofs1[0       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1  ];
      egdofs1[1       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1+1];
      for (i=1; i<dim; i++)
	{
	  egdofs1[i*bas_n1+0] = i*vx_nr + egdofs1[0];
	  egdofs1[i*bas_n1+1] = i*vx_nr + egdofs1[1];
	}


      egpdofs[0] = dim*vx_nr+(*K).pdof[egdofs1[0]];
      egpdofs[1] = dim*vx_nr+(*K).pdof[egdofs1[1]];

      orie = (*msh).pcsurf[pcsu*ps_w+MCXXPSORIE];

      pccrit=(*msh).pcsurf[pcsu*ps_w+MCXXPSCRIT];
      pcdat =&(*msh).pccrit[pccrit*pc_w+MCXXPCDAT1];
      pctype=(FIDX)(*msh).pccrit[pccrit*pc_w+MCXXPCTYPE];

      needgrad=0;
      switch(pctype)
	{
	case 1: /* surface force */
	  needgrad=1;
	  break;
	default:
	  fprintf(stderr,
		  "navsto_dIdX_t21: unknown pcsurf pccrit type!"
		  "(pccrit %d)\n", (int) pccrit);
	  return FAIL;
	}


      if (needgrad==1)
	{
	  edge_elem=0;
	  
#ifdef DEBUGFEINS
	  if ((*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+0]==-1)
	    {
	      fprintf(stderr,
		      "navsto_dIdX_t21: pcsurf w/o VOLE!\n");
	      return FAIL;
	    }
#endif

	  if ((*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+1]==-1)
	    {
	      /* this edge belongs to only 1 element, therefore the
		 elements contribution weight is one */
	      edge_elw=1.0;
	    }
	  else
	    {
	      /* this edge belongs to 2 elements, therefore the
		 elements contribution weight is 1/2 */
	      edge_elw=0.5;
	    }
	  
	  while ((edge_elem<2)&&
		 ((*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+edge_elem]!=-1))
	    {
	      el=(*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+edge_elem];

	      /* define the local eldofs such that they have the
		 egdofs for the local nodes 0, 1, 3 */
	      eldofs2[0]=egdofs2[0];
	      eldofs2[1]=egdofs2[1];
	      /* find the 3rd node in the element */
	      i=0;
	      while ((i<bas_n1)&&
		     ( ((*msh).elem[el*el_w+MCT2ELNOD1+i]==eldofs2[0])
		       ||((*msh).elem[el*el_w+MCT2ELNOD1+i]==eldofs2[1])) )
		{ i++; }
#ifdef DEBUGFEINS	      
	      if (i>=bas_n1)
		{
		  fprintf(stderr,"navsto_dIdX_t21: node not found\n");
		  return FAIL;
		}
	      eldofs2[3]=-1;
	      eldofs2[4]=-1;
	      eldofs2[5]=-1;
#endif
	      eldofs2[2]=(*msh).elem[el*el_w+MCT2ELNOD1+i];
	      /* now define eldof 3, 4, 5 as the according centernodes
	       */
	      for (i=0; i<3; i++)
		for (j=0; j<3; j++)
		  if ( (((*msh).elem[el*el_w+MCT2ELNOD1+j]
			 ==eldofs2[i])
			&&((*msh).elem[el*el_w+MCT2ELNOD1+(j+1)%3]
			   ==eldofs2[(i+1)%3]))
		       ||(((*msh).elem[el*el_w+MCT2ELNOD1+j]
			   ==eldofs2[(i+1)%3])
			  &&((*msh).elem[el*el_w+MCT2ELNOD1+(j+1)%3]
			     ==eldofs2[i])) )
		    {
		      /* local the elements edge j is local edge i */
		      eldofs2[3+i]=(*msh).elem[el*el_w+MCT2ELNOD1+3+j];
		    }
#ifdef DEBUGFEINS
	      /* check if we got all nodes */
	      if ((eldofs2[3]==-1)||((eldofs2[4]==-1)||(eldofs2[5]==-1)))
		{
		  fprintf(stderr,"navsto_dIdX_t21: node 345 not found\n");
		  return FAIL;
		}
#endif
	      /* define the remaining entries accordingly */
	      for (i=1; i<dim; i++)
		for (j=0; j<bas_n2; j++)
		  eldofs2[i*bas_n2+j]=i*vx_nr+eldofs2[j];
	      for (i=0; i<dim; i++)
		for (j=0; j<bas_n1; j++)
		  eldofs1[i*bas_n1+j]=i*vx_nr+eldofs2[j];
	      for (i=0; i<bas_n1; i++)
		elpdofs[i]=dim*vx_nr+(*K).pdof[eldofs1[i]];



	      /* set eldIdF to zero */
	      for (i=0; i<dim*bas_n1; i++) 
		{
		  eldIdF[i]=0.0;
		}
	      /* set eldIdu to zero */
	      for (i=0; i<dim*bas_n2; i++) 
		{
		  eldIdu[i]=0.0;
		}
	      /* set eldIdp to zero */
	      for (i=0; i<bas_n1; i++) 
		{
		  eldIdp[i]=0.0;
		}

	      /* compute the Jacobian of the linear isoparmetric
		 element mapping */
	      /* Jac=0 */
	      for (i=0; i<dim*dim; i++)
		Jac[i]=0.0;
	  
	      /* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
	      for (i=0;i<bas_n1; i++)
		{
		  for (j=0; j<dim; j++)
		    {
		      for (r=0; r<dim; r++)
			{
			  Jac[j*dim+r]+= 
			    (*msh).vertex[eldofs1[i]*vx_w+MCT2VXSTRT+j]
			    * gradp1[0*bas_n1*dim +i*dim +r];
			}
		    }
		}

	      /* get detJac */
	      detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
	      /* get Jacinv (here direct) */
	      Jacinv[0]=1.0/detJac*Jac[3];
	      Jacinv[1]=-1.0/detJac*Jac[1];
	      Jacinv[2]=-1.0/detJac*Jac[2];
	      Jacinv[3]=1.0/detJac*Jac[0];

	      /* ddet_dx= derivative of the |detJac| wrt node positions */
	      for (i=0; i<bas_n1*dim; i++)
		ddet_dx[i]=0;
	      for (l=0; l<bas_n1; l++)
		for (t=0; t<dim; t++)
		  for (s1=0; s1<dim; s1++)
		    ddet_dx[l*dim+t]+= Jacinv[s1*dim+t]
		      *gradp1line[0*bas_n1*dim+l*dim+s1];
	      for (i=0; i<bas_n1*dim; i++)
		ddet_dx[i]*=fabs(detJac);


	      /* Jinvgrad1= Jacinv * gradphi1[k,:,:]
		 (=real world gradient T1 = const on each element )
	      */
	      for (i=0; i<dim; i++)
		{
		  for (j=0; j<bas_n1; j++)
		    {
		      Jinvgrad1[j*dim+i]=0.0;
		      for (l=0; l<dim; l++)
			{
			  Jinvgrad1[j*dim+i]+= Jacinv[l*dim+i]
			    * gradp1line[0*bas_n1*dim+j*dim+l];
			}
		    }
		}


	      /* loop over all integration points */
	      for (k=0; k<iform1.num_points; k++)
		{
		  /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
		     (=real world gradient T2 = linear function on
		     each element)
		  */
		  for (i=0; i<dim; i++)
		    {
		      for (j=0; j<bas_n2; j++)
			{
			  Jinvgrad2[j*dim+i]=0.0;
			  for (l=0; l<dim; l++)
			    {
			      Jinvgrad2[j*dim+i]+= Jacinv[l*dim+i]
				* gradp2line[k*bas_n2*dim+j*dim+l];
			    }
			}
		    }
		  /* derivative of the real world gradient wrt the
		     node positions */
		  for (n=0; n<bas_n2; n++)
		    for (m=0; m<dim; m++)
		      for (t=0; t<dim; t++)
			for (l=0; l<bas_n1; l++)
			  {
			    eldgphi_dx[n*dim*dim*bas_n1+m*dim*bas_n1
				       +t*bas_n1+l]
			      = - Jinvgrad1[l*dim+m]*Jinvgrad2[n*dim+t];
			  }

		  /* pressure at the integration point: */
		  elp=0.0;
		  for(s1=0; s1<bas_n1; s1++)
		    elp+=(*sol).V[elpdofs[s1]]
		      * phi1line[k*bas_n1+s1];

		  
		  /* velocity at the integration point: */
		  for (m=0; m<dim; m++)
		    {
		      elu[m]=0.0;
		      for(s1=0; s1<bas_n2; s1++)
			elu[m]+=(*sol).V[eldofs2[m*bas_n2+s1]]
			  * phi2line[k*bas_n2+s1];
		    }


		  /* velocity gradient at the integration point: */
		  for (i=0; i<dim; i++)
		    for (m=0; m<dim; m++)
		      {
			elgu[i*dim+m]=0.0;
			for(s1=0; s1<bas_n2; s1++)
			  elgu[i*dim+m]+= Jinvgrad2[s1*dim+m]
			    *(*sol).V[eldofs2[i*bas_n2+s1]];
		      }
		  
		  /* derivative of the velocity gradient wrt node
		     positions */
		  for (i=0; i<dim; i++)
		    for (m=0; m<dim; m++)
		      for (t=0; t<dim; t++)
			for (l=0; l<bas_n1; l++)
			  {
			    eldgu_dx[i*dim*dim*bas_n1+m*dim*bas_n1
				     +t*bas_n1+l]
			      = 0.0;
			    for (s1=0; s1<bas_n2; s1++)
			      {
				eldgu_dx[i*dim*dim*bas_n1+m*dim*bas_n1
					 +t*bas_n1+l]
				  += eldgphi_dx[s1*dim*dim*bas_n1
						+m*dim*bas_n1
						+t*bas_n1+l]
				  *(*sol).V[eldofs2[i*bas_n2+s1]];
			      }
			  }

		  /* get tangential vector t and vector u at this
		     integration point */
		  for (j=0; j<dim; j++)
		    {
		      elt[j]=(*msh).vertex[egdofs1[1]*vx_w+MCT2VXSTRT+j]
			-(*msh).vertex[egdofs1[0]*vx_w+MCT2VXSTRT+j];
		    }

		  /* normal vector n*ds= orientation*
		     rotmatrix(pi/2)*t */
		  /* dim==2 only */
		  elnds[0]= -orie*elt[1];
		  elnds[1]=  orie*elt[0];

		  /* derivative of elnds wrt node positions */
		  for (i=0; i<dim*dim*bas_n1; i++)
		    eldnds_dx[i]=0.0;
		  /* d nds[1]/ d x^(0)_0 = */
		  eldnds_dx[1*dim*bas_n1 + 0*bas_n1 + 0] = -orie;
		  /* d nds[0]/ d x^(0)_1 = */
		  eldnds_dx[0*dim*bas_n1 + 1*bas_n1 + 0] = orie;
		  /* d nds[1]/ d x^(1)_0 = */
		  eldnds_dx[1*dim*bas_n1 + 0*bas_n1 + 1] = orie;
		  /* d nds[0]/ d x^(1)_1 = */
		  eldnds_dx[0*dim*bas_n1 + 1*bas_n1 + 1] = -orie;


		  /* now all the incredients are ready, compute the
		     criteria and their derivatives */
		  switch(pctype)
		    {
		    case 1: /* surface force */
		      /* F=int_S[ mu*( (grad u) + (grad u)^T)*n -p*n ]
		       */ 
		      weight= edge_elw*iform1.weights[k];
		      /* viscous force * weight vector */
		      for (i=0; i<dim; i++)
			{
			  for (j=0; j<dim; j++)
			    {
			      pcvec->V[pccrit]+=weight * nu
				* pcdat[i]
				* (elgu[i*dim+j]+elgu[j*dim+i])
				* elnds[j];
			    }
			}
		      /* pressure force * weight vector */
		      for (i=0; i<dim; i++)
			{
			  pcvec->V[pccrit]-=weight
			    * pcdat[i] * elp * elnds[i];
			}

		      /* dIdF */
		      for (t=0; t<dim; t++)
			for (l=0; l<bas_n1; l++)
			  {
			    /* viscous part */
			    for (s1=0; s1<dim; s1++)
			      for (s2=0; s2<dim; s2++)
				{
				  eldIdF[t*bas_n1+l]+=
				    weight*nu*pcdat[s1]
				    *( ( eldgu_dx[s1*dim*dim*bas_n1+
						  s2*dim*bas_n1+t*bas_n1+l]
					 +eldgu_dx[s2*dim*dim*bas_n1+
						   s1*dim*bas_n1+t*bas_n1+l]
					 )* elnds[s2]
				       +(elgu[s1*dim+s2]+elgu[s2*dim+s1])
				       *eldnds_dx[s2*dim*bas_n1+t*bas_n1+l]
				       );
				}
			    /* pressure part */
			    for (s1=0; s1<dim; s1++)
			      {
				eldIdF[t*bas_n1+l]
				  -=weight*elp*pcdat[s1]
				  *eldnds_dx[s1*dim*bas_n1+t*bas_n1+l];
			      }
			  }

		      /* dIdu */
		      for (i=0; i<dim; i++)
			for (j=0; j<bas_n2; j++)
			  for (s1=0; s1<dim; s1++)
			    {
			      eldIdu[i*bas_n2+j]+=
				nu*weight*Jinvgrad2[j*dim+s1]
				*(pcdat[s1]*elnds[i]
				  +pcdat[i]*elnds[s1]);
			    }

		      /* dIdp */
		      for (j=0; j<bas_n1; j++)
			for (s1=0; s1<dim; s1++)
			  {
			    eldIdp[j]
			      -= weight*phi1line[k*bas_n1+j]
			      *pcdat[s1]*elnds[s1];
			  }
		      break;
		    default:
		      fprintf(stderr, "navsto_dIdX_t21: "
			      "unknown pcsurf pccrit type!(pccrit %d)\n",
			      (int) pccrit);
		      return FAIL;
		    }
		} /* end loop over all integration points */

	      /* elementwise derivatives ready, add to global ones */

	      /* dIdF */
	      if (dIdF!=NULL)
		{
		  for (t=0; t<dim; t++)
		    for (l=0; l<bas_n1; l++)
		      dIdF[pccrit].V[t*vx_nr+eldofs1[l]]
			+=eldIdF[t*bas_n1+l];
		}

	      /* dIdsol */
	      if (dIdsol!=NULL)
		{
		  /* dIdu */
		  for (i=0; i<dim; i++)
		    for (j=0; j<bas_n2; j++)
		      dIdsol[pccrit].V[i*vx_nr+eldofs2[j]]
			+=eldIdu[i*bas_n2+j];
		  
		  /* dIdp */
		  for (j=0; j<bas_n1; j++)
		    dIdsol[pccrit].V[elpdofs[j]]
		      +=eldIdp[j];
		}

	      /* try next element side */
	      edge_elem++;
	    } /* end this element side of the pcsurf */

	} /* end needgrad */

      /* look for next entry with no childs */
      pcsu++;
      while ((pcsu<(*msh).ps_nr)&&
	     ((*msh).edge[(*msh).pcsurf[pcsu*ps_w+MCXXPSSURF]*eg_w
			  +MCT2EGCHL1] !=-1)) pcsu++;

    }/* end loop over all finest level pcsurf */

  /* set max_vol_loops to one, if there are more required they will be
     set by an criterion */
  max_vol_loops = 1;

  /* loop over all pcvol entries, max_vol_loops times */
  for (tloop=0; tloop<max_vol_loops; tloop++)
    for (pcvo=0; pcvo<(*msh).pv_nr; pcvo++)
      {
	el=(*msh).pcvol[pcvo*pv_w+MCXXPVVOLM];

	pccrit=(*msh).pcvol[pcvo*pv_w+MCXXPVCRIT];
	pctype=(FIDX)(*msh).pccrit[pccrit*pc_w+MCXXPCTYPE];
	pcdat =&(*msh).pccrit[pccrit*pc_w+MCXXPCDAT1];

	/* define the local dofs */
	for (i=0; i<dim; i++)
	  for (j=0; j<bas_n1; j++)
	    eldofs1[i*bas_n1+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
	for (i=0; i<dim; i++)
	  for (j=0; j<bas_n2; j++)
	    eldofs2[i*bas_n2+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
	for (i=0; i<bas_n1; i++)
	  elpdofs[i]=dim*vx_nr+(*K).pdof[eldofs1[i]];

	/* set eldIdF to zero */
	for (i=0; i<dim*bas_n1; i++) 
	  {
	    eldIdF[i]=0.0;
	  }
	/* set eldIdu to zero */
	for (i=0; i<dim*bas_n2; i++) 
	  {
	    eldIdu[i]=0.0;
	  }
	/* set eldIdp to zero */
	for (i=0; i<bas_n1; i++) 
	  {
	    eldIdp[i]=0.0;
	  }

	/* compute the Jacobian of the linear isoparmetric element
	   mapping */
	/* Jac=0 */
	for (i=0; i<dim*dim; i++)
	  Jac[i]=0.0;
	  
	/* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
	for (i=0;i<bas_n1; i++)
	  {
	    for (j=0; j<dim; j++)
	      {
		for (r=0; r<dim; r++)
		  {
		    Jac[j*dim+r]+= 
		      (*msh).vertex[eldofs1[i]*vx_w+MCT2VXSTRT+j]
		      * gradp1[0*bas_n1*dim +i*dim +r];
		  }
	      }
	  }

	/* get detJac */
	detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
	/* get Jacinv (here direct) */
	Jacinv[0]=1.0/detJac*Jac[3];
	Jacinv[1]=-1.0/detJac*Jac[1];
	Jacinv[2]=-1.0/detJac*Jac[2];
	Jacinv[3]=1.0/detJac*Jac[0];

	/* ddet_dx= derivative of the |detJac| wrt node positions */
	for (i=0; i<bas_n1*dim; i++)
	  ddet_dx[i]=0;
	for (l=0; l<bas_n1; l++)
	  for (t=0; t<dim; t++)
	    for (s1=0; s1<dim; s1++)
	      ddet_dx[l*dim+t]+= Jacinv[s1*dim+t]
		*gradp1[0*bas_n1*dim+l*dim+s1];
	for (i=0; i<bas_n1*dim; i++)
	  ddet_dx[i]*=fabs(detJac);


	/* Jinvgrad1= Jacinv * gradphi1[k,:,:]
	   (=real world gradient T1 = const on each element )
	*/
	for (i=0; i<dim; i++)
	  {
	    for (j=0; j<bas_n1; j++)
	      {
		Jinvgrad1[j*dim+i]=0.0;
		for (l=0; l<dim; l++)
		  {
		    Jinvgrad1[j*dim+i]+= Jacinv[l*dim+i]
		      * gradp1[0*bas_n1*dim+j*dim+l];
		  }
	      }
	  }

     
	/* loop over all integration points */
	for (k=0; k<iform.num_points; k++)
	  {


	    /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
	       (=real world gradient T2 = linear function on each element)
	    */
	    for (i=0; i<dim; i++)
	      {
		for (j=0; j<bas_n2; j++)
		  {
		    Jinvgrad2[j*dim+i]=0.0;
		    for (l=0; l<dim; l++)
		      {
			Jinvgrad2[j*dim+i]+= Jacinv[l*dim+i]
			  * gradp2[k*bas_n2*dim+j*dim+l];
		      }
		  }
	      }


	    /* derivative of the real world gradient wrt the node positions */
	    for (n=0; n<bas_n2; n++)
	      for (m=0; m<dim; m++)
		for (t=0; t<dim; t++)
		  for (l=0; l<bas_n1; l++)
		    {
		      eldgphi_dx[n*dim*dim*bas_n1+m*dim*bas_n1+t*bas_n1+l]
			= - Jinvgrad1[l*dim+m]*Jinvgrad2[n*dim+t];
		    }

	    /* pressure at the integration point: */
	    elp=0.0;
	    for(s1=0; s1<bas_n1; s1++)
	      elp+=(*sol).V[elpdofs[s1]]
		* phi1[k*bas_n1+s1];


	    /* velocity at the integration point: */
	    for (m=0; m<dim; m++)
	      {
		elu[m]=0.0;
		for(s1=0; s1<bas_n2; s1++)
		  elu[m]+=(*sol).V[eldofs2[m*bas_n2+s1]]
		    * phi2[k*bas_n2+s1];
	      }


	    /* velocity gradient at the integration point: */
	    for (i=0; i<dim; i++)
	      for (m=0; m<dim; m++)
		{
		  elgu[i*dim+m]=0.0;
		  for(s1=0; s1<bas_n2; s1++)
		    elgu[i*dim+m]+= Jinvgrad2[s1*dim+m]
		      *(*sol).V[eldofs2[i*bas_n2+s1]];
		}

	    /* derivative of the velocity gradient wrt node positions */
	    for (i=0; i<dim; i++)
	      for (m=0; m<dim; m++)
		for (t=0; t<dim; t++)
		  for (l=0; l<bas_n1; l++)
		    {
		      eldgu_dx[i*dim*dim*bas_n1+m*dim*bas_n1+t*bas_n1+l]
			= 0.0;
		      for (s1=0; s1<bas_n2; s1++)
			{
			  eldgu_dx[i*dim*dim*bas_n1+m*dim*bas_n1+t*bas_n1+l]
			    += eldgphi_dx[s1*dim*dim*bas_n1+m*dim*bas_n1
					  +t*bas_n1+l]
			    *(*sol).V[eldofs2[i*bas_n2+s1]];
			}
		    }



	    /* reference solution velocity and pressure */
	    if (ref_sol!=NULL)
	      {
		/* pressure at the integration point: */
		el_ref_p=0.0;
		for(s1=0; s1<bas_n1; s1++)
		  el_ref_p+=(*ref_sol).V[elpdofs[s1]]
		    * phi1[k*bas_n1+s1];
		/* velocity at the integration point: */
		for (m=0; m<dim; m++)
		  {
		    el_ref_u[m]=0.0;
		for(s1=0; s1<bas_n2; s1++)
		  el_ref_u[m]+=(*ref_sol).V[eldofs2[m*bas_n2+s1]]
		    * phi2[k*bas_n2+s1];
		  }
	      }


	    /* now all the incredients are ready, compute the
	       criteria and their derivatives */
	    switch(pctype)
	      {
	      case 2: /* energy dissipation */
		/* y= (grad u)+(grad u)^T;
		   integrate y:y */
		if (tloop==0)
		  {
		    weight=iform.weights[k]*fabs(detJac)*nu/2;
		    for (i=0; i<dim; i++)
		      {
			for (j=0; j<dim; j++)
			  {
			    pcvec->V[pccrit]+=weight
			      * (elgu[i*dim+j]+elgu[j*dim+i])
			      * (elgu[i*dim+j]+elgu[j*dim+i]);
			  }
		      }

		    weight=iform.weights[k]*nu/2;
		    /* dIdF */
		    for (t=0; t<dim; t++)
		      for (l=0; l<bas_n1; l++)
			for (s1=0; s1<dim; s1++)
			  for (s2=0; s2<dim; s2++)
			    {
			      eldIdF[t*bas_n1+l]
				+= weight
				*( 2*(elgu[s1*dim+s2]+elgu[s2*dim+s1])
				   *(eldgu_dx[s1*dim*dim*bas_n1
					      +s2*dim*bas_n1+t*bas_n1+l]
				     +eldgu_dx[s2*dim*dim*bas_n1
					       +s1*dim*bas_n1+t*bas_n1+l]
				     )* fabs(detJac)
				   + (elgu[s1*dim+s2]+elgu[s2*dim+s1])
				   *(elgu[s1*dim+s2]+elgu[s2*dim+s1])
				   *ddet_dx[l*dim+t] );
			    }
		    /* dIdu */
		    for (i=0; i<dim; i++)
		      for (j=0; j<bas_n2; j++)
			for (s1=0; s1<dim; s1++)
			  {
			    eldIdu[i*bas_n2+j]+=
			      weight*fabs(detJac)*4
			      *(elgu[s1*dim+i]+elgu[i*dim+s1])
			      *Jinvgrad2[j*dim+s1];
			  }
		    /* dIdp=0 */
		  } /* end if tloop==0 */
		break;
	      case 3: /* mesh volume */
		if (tloop==0)
		  {
		    /* integrate 1 */
		    weight=iform.weights[k]*fabs(detJac);
		    pcvec->V[pccrit]+=weight;

		    weight=iform.weights[k];
		    /* dIdF */
		    for (t=0; t<dim; t++)
		      for (l=0; l<bas_n1; l++)
			{
			  eldIdF[t*bas_n1+l]
			    += weight*ddet_dx[l*dim+t];
			}
		    /* dIdu=0 */
		    /* dIdp=0 */
		  } /* end if tloop==0 */		  
		break;
	      case 4: /* mean value */
		if ((max_vol_loops<2)
		    &&((dIdF!=NULL)||(dIdsol!=NULL))) max_vol_loops=2;
		if (tloop==0) /* first loop, compute mean value */
		  {
		    const FIDX comp=(FIDX) pcdat[0];

		    weight=iform.weights[k]*fabs(detJac);
		    /* hv1=integral of the solution component */
		    if ((comp>=0)&&(comp<dim))
		      hv1.V[pccrit]+=weight*elu[comp];
		    else if (comp==dim)
		      hv1.V[pccrit]+=weight*elp;
		    else
		      {
			fprintf(stderr,"navsto_dIdX_t21: pccrit type 4,"
			      " invalid component!(pccrit %d)\n", 
				(int) pccrit);
			return FAIL;
		      }


		    /* hv2=integral over one */
		    hv2.V[pccrit]+=weight;
		  } /* end if tloop==0 */
		else if (tloop==1) /* second loop, the derivatives */
		  {
		    const FIDX comp=(FIDX) pcdat[0];
                    /* have now:
		         hv1=integral of the solution component,
			 hv2=integral over one 
		       only the derivatives need to be done in the
		       second loop */
		    /* hv1=integral of the solution component */
		    if ((comp>=0)&&(comp<dim))
		      {
			weight=iform.weights[k]*fabs(detJac)
			  *pcdat[1]/hv2.V[pccrit];
			/* dIdu */
			for (j=0; j<bas_n2; j++)
			  {
			    eldIdu[comp*bas_n2+j]+= weight
			      *phi2[k*bas_n2+j];
			  }

			weight=iform.weights[k]
			  *pcdat[1]/(hv2.V[pccrit]*hv2.V[pccrit]);
			/* dIdF */
			for (t=0; t<dim; t++)
			  for (l=0; l<bas_n1; l++)
			    {
			      eldIdF[t*bas_n1+l]
				+= weight
				*(elu[comp]*hv2.V[pccrit]
				  -hv1.V[pccrit]*1.0)*ddet_dx[l*dim+t];
			    }
		      }
		    else if (comp==dim)
		      {
			weight=iform.weights[k]*fabs(detJac)
			  *pcdat[1]/hv2.V[pccrit];
			/* dIdp */
			for (j=0; j<bas_n1; j++)
			  {
			    eldIdp[j]+= weight
			      *phi1[k*bas_n1+j];
			  }
			
			weight=iform.weights[k]
			  *pcdat[1]/(hv2.V[pccrit]*hv2.V[pccrit]);
			/* dIdF */
			for (t=0; t<dim; t++)
			  for (l=0; l<bas_n1; l++)
			    {
			      eldIdF[t*bas_n1+l]
				+= weight
				*(elp*hv2.V[pccrit]
				  -hv1.V[pccrit]*1.0)*ddet_dx[l*dim+t];
			    }
		      }
		    else
		      {
			fprintf(stderr,"navsto_dIdX_t21: pccrit type 4,"
			      " invalid component!(pccrit %d)\n",
				(int) pccrit);
			return FAIL;
		      }
		  } /* end if tloop==1 */
		break;
	      case 5: /* square of deviation from mean */
		if (max_vol_loops<2) max_vol_loops=2;
		if (tloop==0)
		  {
		    const FIDX comp=(FIDX) pcdat[0];

		    weight=iform.weights[k]*fabs(detJac);
		    /* hv1=integral of the solution component */
		    if ((comp>=0)&&(comp<dim))
		      hv1.V[pccrit]+=weight*elu[comp];
		    else if (comp==dim)
		      hv1.V[pccrit]+=weight*elp;
		    else
		      {
			fprintf(stderr,"navsto_dIdX_t21: pccrit type 5,"
			      " invalid component!(pccrit %d)\n", 
				(int) pccrit);
			return FAIL;
		      }


		    /* hv2=integral over one */
		    hv2.V[pccrit]+=weight;
		  } /* end if tloop==0 */
		else if (tloop==1)
		  {
		    const FIDX comp=(FIDX) pcdat[0];
                    /* have now:
		         hv1=integral of the solution component,
			 hv2=integral over one 
		       compute square of mean deviation and
		       derivatives in the second loop*/
		    if ((comp>=0)&&(comp<dim))
		      {
			double ldev=elu[comp]-hv1.V[pccrit]/hv2.V[pccrit];

			weight=iform.weights[k]*fabs(detJac)*pcdat[1];
			pcvec->V[pccrit]+=weight * ldev*ldev;

			weight=iform.weights[k]*fabs(detJac)
			  *pcdat[1] * 2.0 * ldev;
			/* dIdu */
			for (j=0; j<bas_n2; j++)
			  {
			    eldIdu[comp*bas_n2+j]+= weight
			      *phi2[k*bas_n2+j];
			  }

			weight=iform.weights[k]*pcdat[1];
			/* dIdF */
			for (t=0; t<dim; t++)
			  for (l=0; l<bas_n1; l++)
			    {
			      eldIdF[t*bas_n1+l]
				+= weight
				*(ldev*ldev)*ddet_dx[l*dim+t];
			    }
		      }
		    else if (comp==dim)
		      {
			double ldev=elp-hv1.V[pccrit]/hv2.V[pccrit];

			weight=iform.weights[k]*fabs(detJac)*pcdat[1];
			pcvec->V[pccrit]+=weight * ldev*ldev;

			weight=iform.weights[k]*fabs(detJac)
			  *pcdat[1] * 2.0 * ldev;
			/* dIdp */
			for (j=0; j<bas_n1; j++)
			  {
			    eldIdp[j]+= weight
			      *phi1[k*bas_n1+j];
			  }
			
			weight=iform.weights[k]*pcdat[1];
			/* dIdF */
			for (t=0; t<dim; t++)
			  for (l=0; l<bas_n1; l++)
			    {
			      eldIdF[t*bas_n1+l]
				+= weight
				*(ldev*ldev)*ddet_dx[l*dim+t];
			    }
		      }
		    else
		      {
			fprintf(stderr,"navsto_dIdX_t21: pccrit type 5,"
			      " invalid component!(pccrit %d)\n", 
				(int) pccrit);
			return FAIL;
		      }
		  } /* end if tloop==1 */
		break;
	      case 6: /* square of deviation from reference solution */
		if (tloop==0)
		  {
		    FIDX comp;
		    const FIDX comp1=(FIDX) pcdat[0];
		    const FIDX comp2=(FIDX) pcdat[1];

		    for (comp=comp1; comp<=comp2; comp++)
		      {
			if ((comp>=0)&&(comp<dim))
			  {
			    double ldev=elu[comp]-el_ref_u[comp];

			    weight=iform.weights[k]*fabs(detJac);
			    pcvec->V[pccrit]+=weight * ldev*ldev;

			    weight=iform.weights[k]*fabs(detJac) 
			      * 2.0 * ldev;
			    /* dIdu */
			    for (j=0; j<bas_n2; j++)
			      {
				eldIdu[comp*bas_n2+j]+= weight
				  *phi2[k*bas_n2+j];
			      }

			    /* dIdF not meaningful in this case*/
			  }
			else if (comp==dim)
			  {
			    double ldev=elp-el_ref_p;

			    weight=iform.weights[k]*fabs(detJac);
			    pcvec->V[pccrit]+=weight * ldev*ldev;

			    weight=iform.weights[k]*fabs(detJac)
			      * 2.0 * ldev;
			    /* dIdp */
			    for (j=0; j<bas_n1; j++)
			      {
				eldIdp[j]+= weight
				  *phi1[k*bas_n1+j];
			      }
			
			    /* dIdF not meaningful in this case*/
			  }
			else
			  {
			    fprintf(stderr,"navsto_dIdX_t21: pccrit type 6,"
				    " invalid component!(pccrit %d)\n", 
				    (int) pccrit);
			    return FAIL;
			  }
		      } /* end loop over components */
		  } /* end if tloop==0 */
		break;
	      default:
		fprintf(stderr,
			"navsto_dIdX_t21: unknown pcvol pccrit type!"
			"(pccrit %d)\n", (int) pccrit);
		return FAIL;
	      }
	  } /* end loop over all integration points */

	/* elementwise derivatives ready, add to global ones */
      
	/* dIdF */
	if (dIdF!=NULL)
	  {
	    for (t=0; t<dim; t++)
	      for (l=0; l<bas_n1; l++)
		dIdF[pccrit].V[t*vx_nr+eldofs1[l]]
		  +=eldIdF[t*bas_n1+l];
	  }

	/* dIdsol */
	if (dIdsol!=NULL)
	  {
	    /* dIdu */
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n2; j++)
		dIdsol[pccrit].V[i*vx_nr+eldofs2[j]]
		  +=eldIdu[i*bas_n2+j];

	    /* dIdp */
	    for (j=0; j<bas_n1; j++)
	      dIdsol[pccrit].V[elpdofs[j]]
		+=eldIdp[j];
	  }
      
      } /* end loop over all pcvol entries */

  /* loop over all pccrit entries to finish some stuff */
  for (pccrit=0; pccrit<pc_nr; pccrit++)
    {
      pctype=(FIDX)(*msh).pccrit[pccrit*pc_w+MCXXPCTYPE];
      pcdat =&(*msh).pccrit[pccrit*pc_w+MCXXPCDAT1];

      switch(pctype)
	{
	case 1: /* surface force */
	  break; /* nothing to be done */
	case 2: /* energy dissipation */
	  break; /* nothing to be done */
	case 3: /* mesh volume */
	  break; /* nothing to be done */
	case 4: /* mean value */
	  /* I = c* int(u)/int(1) */
	  printf("int(u)=%e   int(1)=%e\n",hv1.V[pccrit],hv2.V[pccrit]);
	  pcvec->V[pccrit]=pcdat[1]*hv1.V[pccrit]/hv2.V[pccrit];
	  break;
	case 5: /* square deviation from mean value */
	  break; /* nothing to be done */
	case 6: /* square deviation from reference solution */
	  break; /* nothing to be done */
	default:
	  fprintf(stderr,
		  "navsto_dIdX_t21: unknown pcvol pccrit type!"
		  "(pccrit %d), final corrections\n", (int) pccrit);
	  return FAIL;
	}
    }

  /* free local data */
  vector_free(&hv1);
  vector_free(&hv2);

  free(elpdofs);
  free(eldofs2);
  free(eldofs1);

  free(egpdofs);
  free(egdofs2);
  free(egdofs1);

  free(eldIdp);
  free(eldIdu);
  free(eldIdF);

  free(eldnds_dx);
  free(elnds);
  free(elt);

  free(el_ref_u);

  free(eldgu_dx);
  free(elgu);
  free(elu);

  free(eldgphi_dx);
  free(Jinvgrad2);
  free(Jinvgrad1);
  
  free(ddet_dx);
  free(Jacinv);
  free(Jac);

  free(phi1line);
  free(gradp1line);
  free(hessp1line);
  free(phi2line);
  free(gradp2line);
  free(hessp2line);
  free(pointsline);
  free_intdata (&iform1);
  free_intdata (&iform);

  return SUCCESS;
}


/*FUNCTION*/
int navsto_perf_I_t21( struct mesh *msh, struct navsto_matrix *K,
		       struct vector *sol,
		       FIDX pc_nr, struct vector *pcvec,
 		       struct vector *dIdsol, 
	  	       struct vector *dIdF,
		       struct vector *ref_sol
/* evaluates the performance criteria (given with the mesh) for the
    supplied solution vector sol, stores the values in pcvec

    Input:  msh       - the mesh
            K         - navsto_matrix struct, the pdof part is needed
            sol       - vector containing the solution of a Stokes or
                        Navier-Stokes problem, calculated using the
                        T2/T1 element pair
	   pc_nr     - number of performance criteria, has to be the
	               same as in the mesh file, pcvec, dIdsol and
	               dIdF have to provide space for at least pc_nr
	               criteria
            ref_sol   - vector containing the reference solution, must
                        be compatible with sol (see above, same mesh)
		       if not required, ref_sol=NULL may me specified

    Output: pcvec     - vector containing the values of the performance
	               criteria, has to be allocated by the
	               calling routine, length >= pc_nr
           dIdF      - derivatives of the performance with respect to
	               the node positions, array of pc_nr vectors of
	               length dim*vx_nr, dIdF[i].V[d*vx_nr+j] is the
	               derivative of the i-th component of the
	               performance vector wrt the d-th coordinate of
	               the j-th node, not used if ==NULL

    Return: SUCCESS - success
            FAIL    - failure, see error message, output will not be
                      valid
*/

/*   AD:

 I = pcvec = vektor der abhaengigen
 sol = vektor der unabhaengigen     =>   dI/d sol

 sowie dI/d vertex 
 jeweils lokal (element- oder kantenweise) berechnen

*/

			 ){
   FIDX pcsu, pcvo, el, i, j, k, l, m, n, r, s1, s2, t;
   int  err;

   FIDX dim=2, bas_n1, bas_n2, bas_n1d1, bas_n2d1,
     vx_nr, eg_nr, bd_nr, hi_nr,
     el_w, vx_w, eg_w, fc_w, bd_w, ps_w, pv_w, pc_w, hi_w, fu_w,
     bigN;
   FIDX subtypes[2];

   double nu;             /* nondimensional parameter describing the
			    flow, nu= 1/Re, where Re is the Reynolds
			    number, thus nu=mu/(rho*U*d), where mu is
			    the viscosity of the fluid, rho the
			    density, U is the velocity scale (the
			    solution velocities relate to this scale),
			    d is the lenght scale (the scale by which
			    lengths in the mesh are given),
			    nu is taken from m.param[MC2XPANUPO] */

   struct int_data iform; /* integration formula 2d   */
   struct int_data iform1;/* integration formula 1d   */

   FIDX    *eldofs1, *eldofs2, *elpdofs;
   FIDX    *egdofs1, *egdofs2, *egpdofs;
                          /* degrees of freedom to which the local
			    matrices correspond */


   /* at the current integration point k: */
   adouble *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
   adouble detJac;         /* determinant of the Jacobian */
   adouble weight;         /* weight of the int. point times |detJac| */

   adouble *Jinvgrad1;     /* inverse Jacobian times gradphi1 */
   adouble *Jinvgrad2;     /* inverse Jacobian times gradphi2 */

   adouble elp;            /* pressure at the integration point */
   adouble *elu;           /* velocity at the integration point */
   adouble *elgu;          /* velocity gradient at the integration point */

   double el_ref_p;        /* reference pressure at the integration point */
   double *el_ref_u;       /* reference velocity at the integration point */

   adouble *elt;           /* tangential vector at the integration point */
   adouble *elnds;         /* (outw normal)*ds at the integration point */

   double *hv1, *hv2;      /* help vectors used during evaluation of
			      some criteria, they will be allocated to
			      the same size as pcvec and hold temporary
			      values */
   double *dhv1_dsol, *dhv1_dF, *dhv2_dsol, *dhv2_dF;
                           /* derivatives of the help vectors
			      enties _dsol: 
			      [i*bigN+j] -> deriv of hv[i] wrt. sol[j]
			      enties _dF: 
			      [i*dim*vx_nr+j*dim+d] 
                                         -> deriv of hv[i] wrt. vertex[j*vx_w+d]
			    */


   FIDX pvx_nr;           /* number of pressure dofs */

   /* pointer to the integration data */
   double *phi1, *phi2, *gradp1, *gradp2,
     *phi1d1, *gradp1d1, *phi2d1, *gradp2d1,
     *phi1line, *gradp1line, *hessp1line,
     *phi2line, *gradp2line, *hessp2line, *pointsline;

   FIDX eg, pccrit, pctype, needgrad, edge_elem, orie;
   double edge_elw, *pcdat;

   int tloop, max_vol_loops;

   /* AD elem */
   adouble *avertex_loc;   /* avertex_loc[d*bas_n2+i]=vertex[eldofs[i]*vx_w+d] */
   adouble *asol_loc;      /* asol_loc[]=[sol[eldofs2[:],sol[elpdofs]]      */
   adouble acrit_loc;      /* active local criterion */
   double  crit_loc;       /* local criterion */
   /* adouble ahv1_loc;       /* local help-vector contribution */
   /* adouble ahv2_loc;       /* local help-vector contribution */
   /* AD elem */
       




   /****************   init ******************************************/
   /* get integration formula */
   subtypes[0]=1;
   subtypes[1]=2;

   err=cubature_bases( dim, 4, tria, 2, subtypes, &iform);
   FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_perf_I_t21);

   err=cubature_bases( dim-1, 3, inter, 2, subtypes, &iform1);
   FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_perf_I_t21);

   /* make phi and gradphi better accessible */
   phi1   = (iform.bases[0]->phi);
   gradp1 = (iform.bases[0]->gradphi);
   bas_n1 = (iform.bases[0]->num_basis);
   phi2   = (iform.bases[1]->phi);
   gradp2 = (iform.bases[1]->gradphi);
   bas_n2 = (iform.bases[1]->num_basis);
   phi1d1   = (iform1.bases[0]->phi);
   gradp1d1 = (iform1.bases[0]->gradphi);
   bas_n1d1 = (iform1.bases[0]->num_basis);
   phi2d1   = (iform1.bases[1]->phi);
   gradp2d1 = (iform1.bases[1]->gradphi);
   bas_n2d1 = (iform1.bases[1]->num_basis);

   /* We have to generate a integration formula for the gradients on
      lines as well, as this is not covered by those above. Luckily
      this is quite simple, as we can change the 1d points from iform1
      to 2d points and get phi2line and gradp2line. The weights stay
      the same. */
   TRY_MALLOC( pointsline, iform1.num_points*dim, double,
	      navsto_perf_I_t21);
   for (i=0; i<iform1.num_points; i++)
     {
       pointsline[i*dim+0]=iform1.points[i];
       pointsline[i*dim+1]=0.0;
     }
   err=eval_basis( dim, tria, 2, iform1.num_points, pointsline,
		  &i, &phi2line, &gradp2line, &hessp2line);
   FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_perf_I_t21);
   err=eval_basis( dim, tria, 1, iform1.num_points, pointsline,
		  &i, &phi1line, &gradp1line, &hessp1line);
   FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_perf_I_t21);
   /* now this defines the desired formula */

   vx_nr = (*msh).vx_nr;
   eg_nr = (*msh).eg_nr;
   bd_nr = (*msh).bd_nr;
   hi_nr = (*msh).hi_nr;
   vx_w  = (*msh).vx_w;
   el_w  = (*msh).el_w;
   eg_w  = (*msh).eg_w;
   fc_w  = (*msh).fc_w;
   bd_w  = (*msh).bd_w;
   ps_w  = (*msh).ps_w;
   pv_w  = (*msh).pv_w;
   pc_w  = (*msh).pc_w;
   hi_w  = (*msh).hi_w;
   fu_w  = (*msh).fu_w;

   nu    = (*msh).para[MC2XPANUPO];
   pvx_nr= (*K).pvx_nr;
   bigN  = dim*vx_nr+pvx_nr;

   if (dim > 2)
     {
       /* cry */
       fprintf(stderr,
	      "navsto_perf_I_t21: dim >2 not implemented "
	      "(Jacinv, lineint)\n");
       return FAIL;
     }

   if ((ref_sol!=NULL)&&( (*ref_sol).len!=bigN))
     {
       /* cry */
       fprintf(stderr,
	      "navsto_perf_I_t21: reference solution ref_sol does not fit to mesh"
	      "(size)\n");
       return FAIL;
     }


   if ((bas_n2<3)||(bas_n2d1!=3))
     {
       /* not supported, see definition of dofs2 in surface integral */
       fprintf(stderr,
	      "navsto_perf_I_t21: bas_n2<3 or bas_n2d1!=3 ???????\n");
       return FAIL;
     }


   if ((pc_nr!=(*msh).pc_nr)||(pcvec->len != pc_nr))
     {
       /* cry */
       fprintf(stderr,
	      "navsto_perf_I_t21: pc_nr dosn't match the mesh or"
	      " pcvec has wrong length\n");
       return FAIL;
     }

  if (dIdF!=NULL)
    for (i=0; i<pc_nr; i++)
      if (dIdF[i].len != dim*vx_nr)
	{
	  /* cry */
	  fprintf(stderr, "navsto_perf_I_t21: "
		  "dimension of dIdF[.] !=dim*vx_nr\n");
	  return FAIL;
	}

   /* allocate memory for the matrices/vectors on the element */
/*    TRY_MALLOC( Jac, dim*dim, double, navsto_perf_I_t21); */
/*    TRY_MALLOC( Jacinv, dim*dim, double, navsto_perf_I_t21); */

/*    TRY_MALLOC( Jinvgrad1, dim*bas_n1, double, navsto_perf_I_t21); */
/*    TRY_MALLOC( Jinvgrad2, dim*bas_n2, double, navsto_perf_I_t21); */

   Jac = new adouble[dim*dim];
   Jacinv = new adouble[dim*dim];

   Jinvgrad1 = new adouble[dim*bas_n1];
   Jinvgrad2 = new adouble[dim*bas_n2];

/*    TRY_MALLOC( elu, dim, double, navsto_perf_I_t21); */
   elu = new adouble[dim];
/*    TRY_MALLOC( elgu, dim*bas_n2, double, navsto_perf_I_t21); */
   elgu = new adouble[dim*bas_n2];

/*    TRY_MALLOC( elt, dim, double, navsto_perf_I_t21); */
/*    TRY_MALLOC( elnds, dim, double, navsto_perf_I_t21); */

   elt = new adouble[dim];
   elnds = new adouble[dim];

   TRY_MALLOC( egdofs1, dim*bas_n1, FIDX, navsto_perf_I_t21);
   TRY_MALLOC( egdofs2, dim*bas_n2, FIDX, navsto_perf_I_t21);
   TRY_MALLOC( egpdofs, bas_n1, FIDX, navsto_perf_I_t21);

   TRY_MALLOC( eldofs1, dim*bas_n1, FIDX, navsto_perf_I_t21);
   TRY_MALLOC( eldofs2, dim*bas_n2, FIDX, navsto_perf_I_t21);
   TRY_MALLOC( elpdofs, bas_n1, FIDX, navsto_perf_I_t21);

   avertex_loc = new adouble[dim*bas_n2];
   asol_loc    = new adouble[dim*bas_n2+bas_n1];


   if (ref_sol!=NULL)
     {
       TRY_MALLOC( el_ref_u, dim, double, navsto_perf_I_t21);
     }
   else
     {
       el_ref_u=NULL;
     }

   TRY_MALLOC( hv1, pc_nr, double, navsto_perf_I_t21);
   TRY_MALLOC( hv2, pc_nr, double, navsto_perf_I_t21);

   /* clear the help vectors */
   for (i=0; i<pc_nr; i++)
     {
       hv1[i]  = 0.0;
       hv2[i]  = 0.0;
     }

   if (dIdsol!=NULL)
     {
       /* initialise output */
       for (i=0; i<pc_nr; i++)
	 for (j=0; j<bigN; j++)
	   {
	     dIdsol[i].V[j] = 0.0;
	   }

       /* TRY_MALLOC( dhv1_dsol, pc_nr*bigN, double, navsto_perf_I_t21);
	  TRY_MALLOC( dhv2_dsol, pc_nr*bigN, double,
	  navsto_perf_I_t21); */
       /* clear the derivatives of help vectors */
       /* for (i=0; i<pc_nr*bigN; i++)
	  {
	  dhv1_dsol[i]  = 0.0;
	  dhv2_dsol[i]  = 0.0;
	  } */
     }
   if (dIdF!=NULL)
     {
       /* initialise output */
       for (i=0; i<pc_nr; i++)
	 for (j=0; j<dim*vx_nr; j++)
	   {
	     dIdF[i].V[j] = 0.0;
	   }

       /* TRY_MALLOC( dhv1_dF, pc_nr*dim*vx_nr, double, navsto_perf_I_t21);
	  TRY_MALLOC( dhv2_dF, pc_nr*dim*vx_nr, double,
	  navsto_perf_I_t21); */
       /* clear the derivatives of help vectors */
       /* for (i=0; i<pc_nr*dim*vx_nr; i++)
	  {
	  dhv1_dF[i]  = 0.0;
	  dhv2_dF[i]  = 0.0;
	  } */
     }



   /* clear the output arrays */
   for (i=0; i<pc_nr; i++) pcvec->V[i]  = 0.0;

   /* loop over all pcsurf entries of the finest level*/
   pcsu=0;
   /* look for first entry of the finest level (the first with no childs) */
   while ((pcsu<(*msh).ps_nr)&&
	 ((*msh).edge[(*msh).pcsurf[pcsu*ps_w+MCXXPSSURF]*eg_w+MCT2EGCHL1]
	  !=-1)) pcsu++;
   while (pcsu<(*msh).ps_nr)
     {
       eg=(*msh).pcsurf[pcsu*ps_w+MCXXPSSURF];

       egdofs2[0       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1  ];
       egdofs2[1       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1+1];
       egdofs2[2       ] = (*msh).edge[eg*eg_w + MCT2EGNODM];
       for (i=1; i<dim; i++)
	{
	  egdofs2[i*bas_n2+0] = i*vx_nr + egdofs2[0];
	  egdofs2[i*bas_n2+1] = i*vx_nr + egdofs2[1];
	  egdofs2[i*bas_n2+2] = i*vx_nr + egdofs2[2];
	}

       egdofs1[0       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1  ];
       egdofs1[1       ] = (*msh).edge[eg*eg_w + MCT2EGNOD1+1];
       for (i=1; i<dim; i++)
	{
	  egdofs1[i*bas_n1+0] = i*vx_nr + egdofs1[0];
	  egdofs1[i*bas_n1+1] = i*vx_nr + egdofs1[1];
	}


       egpdofs[0] = dim*vx_nr+(*K).pdof[egdofs1[0]];
       egpdofs[1] = dim*vx_nr+(*K).pdof[egdofs1[1]];

       orie = (*msh).pcsurf[pcsu*ps_w+MCXXPSORIE];

       pccrit=(*msh).pcsurf[pcsu*ps_w+MCXXPSCRIT];
       pcdat =&(*msh).pccrit[pccrit*pc_w+MCXXPCDAT1];
       pctype=(FIDX)(*msh).pccrit[pccrit*pc_w+MCXXPCTYPE];

       needgrad=0;
       switch(pctype)
	{
	case 1: /* surface force */
	  needgrad=1;
	  break;
	default:
	  fprintf(stderr,
		  "navsto_perf_I_t21: unknown pcsurf pccrit type!"
		  "(pccrit %d)\n", (int) pccrit);
	  return FAIL;
	}


       if (needgrad==1)
	{
	  edge_elem=0;
	
#ifdef DEBUGFEINS
	  if ((*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+0]==-1)
	    {
	      fprintf(stderr,
		      "navsto_perf_I_t21: pcsurf w/o VOLE!\n");
	      return FAIL;
	    }
#endif

	  if ((*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+1]==-1)
	    {
	      /* this edge belongs to only 1 element, therefore the
		 elements contribution weight is one */
	      edge_elw=1.0;
	    }
	  else
	    {
	      /* this edge belongs to 2 elements, therefore the
		 elements contribution weight is 1/2 */
	      edge_elw=0.5;
	    }
	
	  while ((edge_elem<2)&&
		 ((*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+edge_elem]!=-1))
	    {
	      el=(*msh).pcsurf[pcsu*ps_w+MCXXPSVOLE+edge_elem];

	      /* define the local eldofs such that they have the
		 egdofs for the local nodes 0, 1, 3 */
	      eldofs2[0]=egdofs2[0];
	      eldofs2[1]=egdofs2[1];
	      /* find the 3rd node in the element */
	      i=0;
	      while ((i<bas_n1)&&
		     ( ((*msh).elem[el*el_w+MCT2ELNOD1+i]==eldofs2[0])
		       ||((*msh).elem[el*el_w+MCT2ELNOD1+i]==eldofs2[1])) )
		{ i++; }
#ifdef DEBUGFEINS	
	      if (i>=bas_n1)
		{
		  fprintf(stderr,"navsto_perf_I_t21: node not found\n");
		  return FAIL;
		}
	      eldofs2[3]=-1;
	      eldofs2[4]=-1;
	      eldofs2[5]=-1;
#endif
	      eldofs2[2]=(*msh).elem[el*el_w+MCT2ELNOD1+i];
	      /* now define eldof 3, 4, 5 as the according centernodes
	       */
	      for (i=0; i<3; i++)
		for (j=0; j<3; j++)
		  if ( (((*msh).elem[el*el_w+MCT2ELNOD1+j]
			 ==eldofs2[i])
			&&((*msh).elem[el*el_w+MCT2ELNOD1+(j+1)%3]
			   ==eldofs2[(i+1)%3]))
		       ||(((*msh).elem[el*el_w+MCT2ELNOD1+j]
			   ==eldofs2[(i+1)%3])
			  &&((*msh).elem[el*el_w+MCT2ELNOD1+(j+1)%3]
			     ==eldofs2[i])) )
		    {
		      /* local the elements edge j is local edge i */
		      eldofs2[3+i]=(*msh).elem[el*el_w+MCT2ELNOD1+3+j];
		    }
#ifdef DEBUGFEINS
	      /* check if we got all nodes */
	      if ((eldofs2[3]==-1)||((eldofs2[4]==-1)||(eldofs2[5]==-1)))
		{
		  fprintf(stderr,"navsto_perf_I_t21: node 345 not found\n");
		  return FAIL;
		}
#endif
	      /* define the remaining entries accordingly */
	      for (i=1; i<dim; i++)
		for (j=0; j<bas_n2; j++)
		  eldofs2[i*bas_n2+j]=i*vx_nr+eldofs2[j];
	      for (i=0; i<dim; i++)
		for (j=0; j<bas_n1; j++)
		  eldofs1[i*bas_n1+j]=i*vx_nr+eldofs2[j];
	      for (i=0; i<bas_n1; i++)
		elpdofs[i]=dim*vx_nr+(*K).pdof[eldofs1[i]];

	      trace_on(1);
	      /* set the input */
	      for (i=0; i<dim; i++)
		for (j=0; j<bas_n2; j++)
		  {
		    avertex_loc[i*bas_n2+j] 
		      <<= msh->vertex[eldofs2[j]*vx_w+MCT2VXSTRT +i];
		  }
	      for (i=0; i<dim; i++)
		for (j=0; j<bas_n2; j++)
		  {
		    asol_loc[i*bas_n2+j] <<= sol->V[eldofs2[i*bas_n2+j]];
		  }
	      for (i=0; i<bas_n1; i++)
		{
		  asol_loc[dim*bas_n2+i] <<= sol->V[elpdofs[i]];
		}
	      acrit_loc = 0.0;


	      /* compute the Jacobian of the linear isoparmetric
		 element mapping */
	      /* Jac=0 */
	      for (i=0; i<dim*dim; i++)
		Jac[i]=0.0;
	
	      /* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
	      for (i=0;i<bas_n1; i++)
		{
		  for (j=0; j<dim; j++)
		    {
		      for (r=0; r<dim; r++)
			{
			  Jac[j*dim+r]+=
			    avertex_loc[j*bas_n2+i]
			    * gradp1[0*bas_n1*dim +i*dim +r];
			}
		    }
		}

	      /* get detJac */
	      detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	
	      /* get Jacinv (here direct) */
	      Jacinv[0]=1.0/detJac*Jac[3];
	      Jacinv[1]=-1.0/detJac*Jac[1];
	      Jacinv[2]=-1.0/detJac*Jac[2];
	      Jacinv[3]=1.0/detJac*Jac[0];


	      /* Jinvgrad1= Jacinv * gradphi1[k,:,:]
		 (=real world gradient T1 = const on each element )
	      */
	      for (i=0; i<dim; i++)
		{
		  for (j=0; j<bas_n1; j++)
		    {
		      Jinvgrad1[j*dim+i]=0.0;
		      for (l=0; l<dim; l++)
			{
			  Jinvgrad1[j*dim+i]+= Jacinv[l*dim+i]
			    * gradp1line[0*bas_n1*dim+j*dim+l];
			}
		    }
		}


	      /* loop over all integration points */
	      for (k=0; k<iform1.num_points; k++)
		{
		  /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
		     (=real world gradient T2 = linear function on
		     each element)
		  */
		  for (i=0; i<dim; i++)
		    {
		      for (j=0; j<bas_n2; j++)
			{
			  Jinvgrad2[j*dim+i]=0.0;
			  for (l=0; l<dim; l++)
			    {
			      Jinvgrad2[j*dim+i]+= Jacinv[l*dim+i]
				* gradp2line[k*bas_n2*dim+j*dim+l];
			    }
			}
		    }

		  /* pressure at the integration point: */
		  elp=0.0;
		  for(s1=0; s1<bas_n1; s1++)
		    elp+=asol_loc[dim*bas_n2 + s1]
		      * phi1line[k*bas_n1+s1];

		
		  /* velocity at the integration point: */
		  for (m=0; m<dim; m++)
		    {
		      elu[m]=0.0;
		      for(s1=0; s1<bas_n2; s1++)
			elu[m]+=asol_loc[m*bas_n2+s1]
			  * phi2line[k*bas_n2+s1];
		    }


		  /* velocity gradient at the integration point: */
		  for (i=0; i<dim; i++)
		    for (m=0; m<dim; m++)
		      {
			elgu[i*dim+m]=0.0;
			for(s1=0; s1<bas_n2; s1++)
			  elgu[i*dim+m]+= Jinvgrad2[s1*dim+m]
			    *asol_loc[i*bas_n2+s1];
		      }
		
		  /* get tangential vector t and vector u at this
		     integration point */
		  for (j=0; j<dim; j++)
		    {
		      elt[j]=avertex_loc[j*bas_n2+1]
			-avertex_loc[j*bas_n2+0];
		    }

		  /* normal vector n*ds= orientation*
		     rotmatrix(pi/2)*t */
		  /* dim==2 only */
		  elnds[0]= -orie*elt[1];
		  elnds[1]=  orie*elt[0];


		  /* now all the incredients are ready, compute the
		     criteria */
		  switch(pctype)
		    {
		    case 1: /* surface force */
		      /* F=int_S[ mu*( (grad u) + (grad u)^T)*n -p*n ]
		       */
		      weight= edge_elw*iform1.weights[k];
		      /* viscous force * weight vector */
		      for (i=0; i<dim; i++)
			{
			  for (j=0; j<dim; j++)
			    {
			      acrit_loc+=weight * nu
				* pcdat[i]
				* (elgu[i*dim+j]+elgu[j*dim+i])
				* elnds[j];
			    }
			}
		      /* pressure force * weight vector */
		      for (i=0; i<dim; i++)
			{
			  acrit_loc-=weight
			    * pcdat[i] * elp * elnds[i];
			}

		      break;
		    default:
		      fprintf(stderr, "navsto_perf_I_t21: "
			      "unknown pcsurf pccrit type!(pccrit %d)\n",
			      (int) pccrit);
		      return FAIL;
		    }
		} /* end loop over all integration points */

	      acrit_loc >>= crit_loc;
	      trace_off();

	      pcvec->V[pccrit]+=crit_loc;

	      if ((dIdsol != NULL)||(dIdF != NULL))
		{ /* need to evaluate the derivatives of the local
		     criterion */
		  double *base_point;
		  double *grad_loc;

		  TRY_MALLOC(base_point, (dim*bas_n2)+(dim*bas_n2+bas_n1),
			     double, navsto_perf_I_t21);
		  TRY_MALLOC(grad_loc, (dim*bas_n2)+(dim*bas_n2+bas_n1),
			     double, navsto_perf_I_t21);
		  
		  /* same order of input as above! */
		  for (i=0; i<dim; i++)
		    for (j=0; j<bas_n2; j++)
		      {
			base_point[i*bas_n2+j] 
			  = msh->vertex[eldofs2[j]*vx_w+MCT2VXSTRT +i];
		      }
		  for (i=0; i<dim; i++)
		    for (j=0; j<bas_n2; j++)
		      {
			base_point[(i+dim)*bas_n2+j] 
			  = sol->V[eldofs2[i*bas_n2+j]];
		      }
		  for (i=0; i<bas_n1; i++)
		    {
		      base_point[(2*dim)*bas_n2+i] 
			= sol->V[elpdofs[i]];
		    }
		  
		  gradient(1,(dim*bas_n2)+(dim*bas_n2+bas_n1),
			   base_point,grad_loc);

		  /* derivatives in same order as input! */
		  for (i=0; i<dim; i++)
		    for (j=0; j<bas_n2; j++)
		      {
			dIdF[pccrit].V[i*vx_nr+eldofs2[j]]
			  +=grad_loc[i*bas_n2+j];
		      }
		  for (i=0; i<dim; i++)
		    for (j=0; j<bas_n2; j++)
		      {
			dIdsol[pccrit].V[eldofs2[i*bas_n2+j]]
			  +=grad_loc[(i+dim)*bas_n2+j];
		      }
		  for (i=0; i<bas_n1; i++)
		    {
		      dIdsol[pccrit].V[elpdofs[i]]
			+=grad_loc[(2*dim)*bas_n2+i];
		    }
		      
		  free(grad_loc);
		  free(base_point);		    
		} /* end AD local derivatives */
		  
	      /* try next element side */
	      edge_elem++;
	    } /* end this element side of the pcsurf */

	} /* end needgrad */

       /* look for next entry with no childs */
       pcsu++;
       while ((pcsu<(*msh).ps_nr)&&
	     ((*msh).edge[(*msh).pcsurf[pcsu*ps_w+MCXXPSSURF]*eg_w
			  +MCT2EGCHL1] !=-1)) pcsu++;

     }/* end loop over all finest level pcsurf */

   /* set max_vol_loops to one, if there are more required they will be
      set by an criterion */
   max_vol_loops = 1;

   /* loop over all pcvol entries, max_vol_loops times */
   for (tloop=0; tloop<max_vol_loops; tloop++)
     for (pcvo=0; pcvo<(*msh).pv_nr; pcvo++)
       {
	el=(*msh).pcvol[pcvo*pv_w+MCXXPVVOLM];

	pccrit=(*msh).pcvol[pcvo*pv_w+MCXXPVCRIT];
	pctype=(FIDX)(*msh).pccrit[pccrit*pc_w+MCXXPCTYPE];
	pcdat =&(*msh).pccrit[pccrit*pc_w+MCXXPCDAT1];

	/* define the local dofs */
	for (i=0; i<dim; i++)
	  for (j=0; j<bas_n1; j++)
	    eldofs1[i*bas_n1+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
	for (i=0; i<dim; i++)
	  for (j=0; j<bas_n2; j++)
	    eldofs2[i*bas_n2+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
	for (i=0; i<bas_n1; i++)
	  elpdofs[i]=dim*vx_nr+(*K).pdof[eldofs1[i]];

	trace_on(1);
	/* set the input */
	for (i=0; i<dim; i++)
	  for (j=0; j<bas_n2; j++)
	    {
	      avertex_loc[i*bas_n2+j] 
		<<= msh->vertex[eldofs2[j]*vx_w+MCT2VXSTRT +i];
	    }
	for (i=0; i<dim; i++)
	  for (j=0; j<bas_n2; j++)
	    {
	      asol_loc[i*bas_n2+j] <<= sol->V[eldofs2[i*bas_n2+j]];
	    }
	for (i=0; i<bas_n1; i++)
	  {
	    asol_loc[dim*bas_n2+i] <<= sol->V[elpdofs[i]];
	  }
	acrit_loc = 0.0;

	/* compute the Jacobian of the linear isoparmetric element
	   mapping */
	/* Jac=0 */
	for (i=0; i<dim*dim; i++)
	  Jac[i]=0.0;
	
	/* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
	for (i=0;i<bas_n1; i++)
	  {
	    for (j=0; j<dim; j++)
	      {
		for (r=0; r<dim; r++)
		  {
		    Jac[j*dim+r]+=
		       avertex_loc[j*bas_n2+i]
		      * gradp1[0*bas_n1*dim +i*dim +r];
		  }
	      }
	  }

	/* get detJac */
	detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	
	/* get Jacinv (here direct) */
	Jacinv[0]=1.0/detJac*Jac[3];
	Jacinv[1]=-1.0/detJac*Jac[1];
	Jacinv[2]=-1.0/detJac*Jac[2];
	Jacinv[3]=1.0/detJac*Jac[0];

	/* Jinvgrad1= Jacinv * gradphi1[k,:,:]
	   (=real world gradient T1 = const on each element )
	*/
	for (i=0; i<dim; i++)
	  {
	    for (j=0; j<bas_n1; j++)
	      {
		Jinvgrad1[j*dim+i]=0.0;
		for (l=0; l<dim; l++)
		  {
		    Jinvgrad1[j*dim+i]+= Jacinv[l*dim+i]
		      * gradp1[0*bas_n1*dim+j*dim+l];
		  }
	      }
	  }


	/* loop over all integration points */
	for (k=0; k<iform.num_points; k++)
	  {
	    /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
	       (=real world gradient T2 = linear function on each element)
	    */
	    for (i=0; i<dim; i++)
	      {
		for (j=0; j<bas_n2; j++)
		  {
		    Jinvgrad2[j*dim+i]=0.0;
		    for (l=0; l<dim; l++)
		      {
			Jinvgrad2[j*dim+i]+= Jacinv[l*dim+i]
			  * gradp2[k*bas_n2*dim+j*dim+l];
		      }
		  }
	      }

	    /* pressure at the integration point: */
	    elp=0.0;
	    for(s1=0; s1<bas_n1; s1++)
	      elp+=asol_loc[dim*bas_n2 + s1]
		* phi1[k*bas_n1+s1];


	    /* velocity at the integration point: */
	    for (m=0; m<dim; m++)
	      {
		elu[m]=0.0;
		for(s1=0; s1<bas_n2; s1++)
		  elu[m]+=asol_loc[m*bas_n2 + s1]
		    * phi2[k*bas_n2+s1];
	      }


	    /* velocity gradient at the integration point: */
	    for (i=0; i<dim; i++)
	      for (m=0; m<dim; m++)
		{
		  elgu[i*dim+m]=0.0;
		  for(s1=0; s1<bas_n2; s1++)
		    elgu[i*dim+m]+= Jinvgrad2[s1*dim+m]
		       *asol_loc[i*bas_n2+s1];
		}


	    /* reference solution velocity and pressure */
	    if (ref_sol!=NULL)
	      {
		/* pressure at the integration point: */
		el_ref_p=0.0;
		for(s1=0; s1<bas_n1; s1++)
		  el_ref_p+=(*ref_sol).V[elpdofs[s1]]
		    * phi1[k*bas_n1+s1];
		/* velocity at the integration point: */
		for (m=0; m<dim; m++)
		  {
		    el_ref_u[m]=0.0;
		for(s1=0; s1<bas_n2; s1++)
		  el_ref_u[m]+=(*ref_sol).V[eldofs2[m*bas_n2+s1]]
		    * phi2[k*bas_n2+s1];
		  }
	      }


	    /* now all the incredients are ready, compute the
	       criteria and their derivatives */
	    switch(pctype)
	      {
	      case 2: /* energy dissipation */
		/* y= (grad u)+(grad u)^T;
		   integrate y:y */
		if (tloop==0)
		  {
		    weight=iform.weights[k]*fabs(detJac)*nu/2;
		    for (i=0; i<dim; i++)
		      {
			for (j=0; j<dim; j++)
			  {
			    acrit_loc+=weight
			      * (elgu[i*dim+j]+elgu[j*dim+i])
			      * (elgu[i*dim+j]+elgu[j*dim+i]);
			  }
		      }
		  } /* end if tloop==0 */
		break;
	      case 3: /* mesh volume */
		if (tloop==0)
		  {
		    /* integrate 1 */
		    weight=iform.weights[k]*fabs(detJac);
		    acrit_loc+=weight;
		  } /* end if tloop==0 */		
		break;
	      case 4: /* mean value */
		if (tloop==0) /* first loop, compute mean value */
		  {
		    const FIDX comp=(FIDX) pcdat[0];

		    fprintf(stderr,"navsto_perf_I_t21: hv1 and hv2 not "
			    "done by AD_elem yet!\n"); 
		    return FAIL;

		    weight=iform.weights[k]*fabs(detJac);
		    /* hv1=integral of the solution component */
		    /* if ((comp>=0)&&(comp<dim))
		       hv1_loc+=weight*elu[comp];
		       else if (comp==dim)
		       hv1_loc+=weight*elp;
		       else
		       {
		       fprintf(stderr,"navsto_perf_I_t21: pccrit type 4,"
		       " invalid component!(pccrit %d)\n",
		       (int) pccrit);
		       return FAIL;
		       } */


		    /* hv2=integral over one */
		    /* ahv2_loc+=weight; */
		  } /* end if tloop==0 */
		break;
	      case 5: /* square of deviation from mean */
		if (max_vol_loops<2) max_vol_loops=2;
		if (tloop==0)
		  {
		    const FIDX comp=(FIDX) pcdat[0];

		    fprintf(stderr,"navsto_perf_I_t21: hv1 and hv2 not "
			    "done by AD_elem yet!\n"); 
		    return FAIL;

		    weight=iform.weights[k]*fabs(detJac);
		    /* hv1=integral of the solution component */
		    /* 
		       if ((comp>=0)&&(comp<dim))
		       ahv1_loc+=weight*elu[comp];
		       else if (comp==dim)
		       ahv1_loc+=weight*elp;
		       else
		       {
		       fprintf(stderr,"navsto_perf_I_t21: pccrit type 5,"
		       " invalid component!(pccrit %d)\n",
		       (int) pccrit);
		       return FAIL;
		       } */


		    /* hv2=integral over one */
		    /* ahv2_loc+=weight; */
		  } /* end if tloop==0 */
		else if (tloop==1)
		  {
		    const FIDX comp=(FIDX) pcdat[0];
                     /* have now:
		         hv1=integral of the solution component,
			 hv2=integral over one
		       compute square of mean deviation */
		    /*
		      if ((comp>=0)&&(comp<dim))
		      {
		      adouble ldev=elu[comp]-hv1[pccrit]/hv2[pccrit];
		      
		      weight=iform.weights[k]*fabs(detJac)*pcdat[1];
		      acrit_loc+=weight * ldev*ldev;
		      for (i=0; i<bigN;i++)
		      {
		      dIdsol[pccrit].V[i]+=weight.value() * 2*ldev*
		      (dhv1_dsol[i]
		      }
			
		      }
		      else if (comp==dim)
		      {
		      adouble ldev=elp-hv1[pccrit]/hv2[pccrit];
		      
		      weight=iform.weights[k]*fabs(detJac)*pcdat[1];
		      apcvec[pccrit]+=weight * ldev*ldev;
		      }
		      else
		      {
		      fprintf(stderr,"navsto_perf_I_t21: pccrit type 5,"
		      " invalid component!(pccrit %d)\n",
		      (int) pccrit);
		      return FAIL;
		      }*/
		  } /* end if tloop==1 */
		break;
	      case 6: /* square of deviation from reference solution */
		if (tloop==0)
		  {
		    FIDX comp;
		    const FIDX comp1=(FIDX) pcdat[0];
		    const FIDX comp2=(FIDX) pcdat[1];

		    for (comp=comp1; comp<=comp2; comp++)
		      {
			if ((comp>=0)&&(comp<dim))
			  {
			    adouble ldev=elu[comp]-el_ref_u[comp];

			    weight=iform.weights[k]*fabs(detJac);
			    acrit_loc+=weight * ldev*ldev;
			  }
			else if (comp==dim)
			  {
			    adouble ldev=elp-el_ref_p;

			    weight=iform.weights[k]*fabs(detJac);
			    acrit_loc+=weight * ldev*ldev;
			  }
			else
			  {
			    fprintf(stderr,"navsto_perf_I_t21: pccrit type 6,"
				    " invalid component!(pccrit %d)\n",
				    (int) pccrit);
			    return FAIL;
			  }
		      } /* end loop over components */
		  } /* end if tloop==0 */
		break;
	      default:
		fprintf(stderr,
			"navsto_perf_I_t21: unknown pcvol pccrit type!"
			"(pccrit %d)\n", (int) pccrit);
		return FAIL;
	      }
	  } /* end loop over all integration points */
	
	acrit_loc >>= crit_loc;
	trace_off();

	pcvec->V[pccrit]+=crit_loc;
	
	if ((dIdsol != NULL)||(dIdF != NULL))
	  { /* need to evaluate the derivatives of the local
	       criterion */
	    double *base_point;
	    double *grad_loc;

	    TRY_MALLOC(base_point, (dim*bas_n2)+(dim*bas_n2+bas_n1),
		       double, navsto_perf_I_t21);
	    TRY_MALLOC(grad_loc, (dim*bas_n2)+(dim*bas_n2+bas_n1),
		       double, navsto_perf_I_t21);
		  
	    /* same order of input as above! */
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n2; j++)
		{
		  base_point[i*bas_n2+j] 
		    = msh->vertex[eldofs2[j]*vx_w+MCT2VXSTRT +i];
		}
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n2; j++)
		{
		  base_point[(i+dim)*bas_n2+j] 
		    = sol->V[eldofs2[i*bas_n2+j]];
		}
	    for (i=0; i<bas_n1; i++)
	      {
		base_point[(2*dim)*bas_n2+i] 
		  = sol->V[elpdofs[i]];
	      }
	    
	    gradient(1,(dim*bas_n2)+(dim*bas_n2+bas_n1),
		     base_point,grad_loc);
	    
	    /* derivatives in same order as input! */
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n2; j++)
		{
		  dIdF[pccrit].V[i*vx_nr+eldofs2[j]]
		    +=grad_loc[i*bas_n2+j];
		}
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n2; j++)
		{
		  dIdsol[pccrit].V[eldofs2[i*bas_n2+j]]
		    +=grad_loc[(i+dim)*bas_n2+j];
		}
	    for (i=0; i<bas_n1; i++)
	      {
		dIdsol[pccrit].V[elpdofs[i]]
		  +=grad_loc[(2*dim)*bas_n2+i];
	      }
		      
	    free(grad_loc);
	    free(base_point);		    
	  } /* end AD local derivatives */

       } /* end loop over all pcvol entries */

   /* loop over all pccrit entries to finish some stuff */
   for (pccrit=0; pccrit<pc_nr; pccrit++)
     {
       pctype=(FIDX)(*msh).pccrit[pccrit*pc_w+MCXXPCTYPE];
       pcdat =&(*msh).pccrit[pccrit*pc_w+MCXXPCDAT1];

       switch(pctype)
	{
	case 1: /* surface force */
	  break; /* nothing to be done */
	case 2: /* energy dissipation */
	  break; /* nothing to be done */
	case 3: /* mesh volume */
	  break; /* nothing to be done */
	case 4: /* mean value */
	  /* I = c* int(u)/int(1) */
	  printf("int(u)=%e   int(1)=%e\n",hv1[pccrit],hv2[pccrit]);
	  pcvec->V[pccrit]=pcdat[1]*hv1[pccrit]/hv2[pccrit];
	  break;
	case 5: /* square deviation from mean value */
	  break; /* nothing to be done */
	case 6: /* square deviation from reference solution */
	  break; /* nothing to be done */
	default:
	  fprintf(stderr,
		  "navsto_perf_I_t21: unknown pcvol pccrit type!"
		  "(pccrit %d), final corrections\n", (int) pccrit);
	  return FAIL;
	}
     }
 
   /* free local data */

   delete[] avertex_loc;
   delete[] asol_loc;

   free(hv2);
   free(hv1);

   free(el_ref_u);

   free(elpdofs);
   free(eldofs2);
   free(eldofs1);

   free(egpdofs);
   free(egdofs2);
   free(egdofs1);

   delete[] elnds;
   delete[] elt;

   delete[] elgu;
   delete[] elu;

   delete[] Jinvgrad2;
   delete[] Jinvgrad1;

   delete[] Jacinv;
   delete[] Jac;

   free(phi1line);
   free(gradp1line);
   free(hessp1line);
   free(phi2line);
   free(gradp2line);
   free(hessp2line);
   free(pointsline);
   free_intdata (&iform1);
   free_intdata (&iform);

   return SUCCESS;
}


/*FUNCTION*/
int navsto_Psi_tim_Res_t21(double *Psi_tim_Res, double **d_Psi_tim_Res__ds,
			   struct vector *Psi,
			   struct vector *sol,
			   struct navsto_matrix *K,
			   struct mesh *msh
/* evaluates the Psi times the residual, where the residual is defined
    as the finite element discretisation (with P2-P1 triangles) of the
    Navier-Stokes equation

          -nu Laplace(u) + u * grad u + grad p = f
                                         div u = 0

    with boundary conditions as given in K,

    The evaluation of the derivatives via the discrete adjoint method
    requires the evaluation of

      DI   dI         dR
      -- = -- - Psi^T -- .
      DF   dF         dF

    This routine provides the

                Psi^T R

    so that appropriate use of algorithmic differentiation may be used
    to compute the required derivative term (Note that the "-" is omitted!)

            dR
      Psi^T -- .
            dF




    In/Out: Psi     - the adjoint solution vector for the NS-system,
                      it is mostly read only, but the values at
                      dirichlet nodes are set to zero,
		     SO IT IS MODIFIED!
	
    Input:  sol     - the solution vector of the NS-system (where the
                      system is to be linearized)
	   K       - the Navier-Stokes matrix struct, used to get the
	             pressure dofs and boundary conditions
	   m       - the mesh, the shape defining node positions are
	             marked in there as well

    Output: Psi_tim_Res
                    - Psi times the residual,
	    d_Psi_tim_Res__ds
                    - the derivative of Psi_tim_Res wrt. the vertex
		      positions, pointer to double provided by
		      reverence, new array of size vx_nr*vx_w is
		      created

    Return: SUCCESS - success
            FAIL    - failure, see error message, output will not be
                      valid
*/
/*   AD:

 Psi_tim_Res = abhaengige (nur eine!)

 sowie d Psi_tim_Res/d vertex 
*/

		    ){
   FIDX el, i, j, k, l, m, n, r, s1, t;
   int  err;

   FIDX dim=2, bas_n1, bas_n2, bas_n2_2, bas_n2_3, vx_nr, eg_nr, hi_nr,
     bd_nr, el_w, vx_w, eg_w, fc_w, bd_w, hi_w, fu_w,
     bigN;
   FIDX subtypes[2];

   struct int_data iform; /* integration formula 2d   */

   adouble *elResv;       /* element residual velocity comps */
   adouble *elResp;       /* element residual pressure comps */

   adouble *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
   adouble detJac;         /* determinant of the Jacobian */
   adouble weight;         /* weight of the int. point times |detJac| */


   adouble *Jinvgrad1;     /* inverse Jacobian times gradphi1 */
   adouble *Jinvgrad2;     /* inverse Jacobian times gradphi2 */

   double elp;            /* pressure at the integration point */
   double *elu;           /* velocity at the integration point */
   adouble *elgu;          /* velocity gradient at the integration point */

   FIDX    *dofs1, *dofs2, *elpdofs;
                          /* degrees of freedom to which the local
			    matrices correspond */
   double nu;             /* nondimensional parameter describing the
			    flow, nu= 1/Re, where Re is the Reynolds
			    number, thus nu=mu/(rho*U*d), where mu is
			    the viscosity of the fluid, rho the
			    density, U is the velocity scale (the
			    solution velocities relate to this scale),
			    d is the lenght scale (the scale by which
			    lengths in the mesh are given),
			    nu is taken from m.param[MC2XPANUPO] */

   FIDX pvx_nr;           /* number of pressure dofs */
   FIDX *pdofs;           /* vector specifying the dof for a pressure
			    node */

   /* some helpers to work out the derivative of the last row */
   double *sumpdwdF;      /* sum of pressure times deriv of weight */
   double sumw;           /* sum of weights */


   double *phi1, *phi2, *gradp1, *gradp2, *hessp2;

   /**************             AD     ********************************/

   adouble *avertex_loc;
   adouble aPsi_tim_Res_loc;
   double  Psi_tim_Res_loc;


   /****************   init ******************************************/
   /* get integration formula */
   subtypes[0]=1;
   subtypes[1]=2;

   /* for t21 with linear isoparametric mappings only degree 5
      necessary, but not available yet ==> use degree 7 */
   err=cubature_bases( dim, 7, tria, 2, subtypes, &iform);
   FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_Psi_tim_Res_t21);

   /* make phi and gradphi better accessible */
   phi1   = (iform.bases[0]->phi);
   gradp1 = (iform.bases[0]->gradphi);
   bas_n1 = (iform.bases[0]->num_basis);
   phi2   = (iform.bases[1]->phi);
   gradp2 = (iform.bases[1]->gradphi);
   hessp2 = (iform.bases[1]->hessphi);
   bas_n2 = (iform.bases[1]->num_basis);

   vx_nr = (*msh).vx_nr;
   eg_nr = (*msh).eg_nr;
   hi_nr = (*msh).hi_nr;
   bd_nr = (*msh).bd_nr;
   vx_w  = (*msh).vx_w;
   el_w  = (*msh).el_w;
   eg_w  = (*msh).eg_w;
   fc_w  = (*msh).fc_w;
   bd_w  = (*msh).bd_w;
   hi_w  = (*msh).hi_w;
   fu_w  = (*msh).fu_w;

   nu    = (*msh).para[MC2XPANUPO];
   pdofs = (*K).pdof;
   pvx_nr= (*K).pvx_nr;
   bigN  = dim*vx_nr+pvx_nr;

   if (dim > 2)
     {
       /* cry */
       fprintf(stderr,
	      "navsto_Psi_tim_Res_t21: dim >2 not implemented (Jacinv)\n");
       return FAIL;
     }

   if (bas_n1>bas_n2)
     {
       /* cry */
       fprintf(stderr,
	      "navsto_Psi_tim_Res_t21: bas_n1>bas_n2 ???????\n");
       return FAIL;
     }

   if ( Psi->len != bigN )
     {
       fprintf(stderr,
	      "navsto_Psi_tim_Res_t21: Psi has wrong size\n");
       return FAIL;
     }
   if ( sol->len != bigN )
     {
       fprintf(stderr,
	      "navsto_Psi_tim_Res_t21: sol has wrong size\n");
       return FAIL;
     }


   /* further init */
   bas_n2_2=bas_n2*bas_n2;    /* bas_n2^2 */
   bas_n2_3=bas_n2*bas_n2_2;  /* bas_n2^3 */

   /* allocate memory for the matrices/vectors on the element */
/*    TRY_MALLOC( Jac, dim*dim, double, navsto_Psi_tim_Res_t21); */
/*    TRY_MALLOC( Jacinv, dim*dim, double, navsto_Psi_tim_Res_t21); */

   Jac = new adouble[dim*dim];
   Jacinv = new adouble[dim*dim];

/*    TRY_MALLOC( Jinvgrad1, dim*bas_n1, double, navsto_Psi_tim_Res_t21); */
/*    TRY_MALLOC( Jinvgrad2, dim*bas_n2, double, navsto_Psi_tim_Res_t21); */

   Jinvgrad1 = new adouble[dim*bas_n1];
   Jinvgrad2 = new adouble[dim*bas_n2];

   TRY_MALLOC( elu, dim, double, navsto_Psi_tim_Res_t21);
/*    TRY_MALLOC( elgu, dim*bas_n2, double, navsto_Psi_tim_Res_t21); */

   elgu = new adouble[dim*bas_n2];
   
/*    TRY_MALLOC( elResv, dim*bas_n2, double, navsto_Psi_tim_Res_t21); */
/*    TRY_MALLOC( elResp, bas_n1, double, navsto_Psi_tim_Res_t21); */
   elResv = new adouble[dim*bas_n2];
   elResp = new adouble[bas_n1];


   TRY_MALLOC( dofs1, dim*bas_n1, FIDX, navsto_Psi_tim_Res_t21);
   TRY_MALLOC( dofs2, dim*bas_n2, FIDX, navsto_Psi_tim_Res_t21);
   TRY_MALLOC( elpdofs, bas_n1, FIDX, navsto_Psi_tim_Res_t21);

   /**************             AD     ********************************/

   avertex_loc = new adouble[dim*bas_n1];


   if (d_Psi_tim_Res__ds!=NULL)
     {
       *d_Psi_tim_Res__ds = new double[vx_nr*vx_w];

       for (i=0; i<vx_nr*vx_w; i++)
	 {
	   (*d_Psi_tim_Res__ds)[i] = 0.0;
	 }
     }
   
   /**************             AD     ********************************/


   /* loop over all elements */
   for (el=0; el<(*msh).el_nr; el++)
     {
       /* define the loval dofs */
       for (i=0; i<dim; i++)
	for (j=0; j<bas_n1; j++)
	  dofs1[i*bas_n1+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
       for (i=0; i<dim; i++)
	 for (j=0; j<bas_n2; j++)
	   dofs2[i*bas_n2+j]=i*vx_nr+(*msh).elem[el*el_w+MCT2ELNOD1+j];
       for (i=0; i<bas_n1; i++)
	 elpdofs[i]=dim*vx_nr+pdofs[dofs1[i]];

       trace_on(2);
       /* set the input */
       for (i=0; i<dim; i++)
	 for (j=0; j<bas_n1; j++)
	   {
	     avertex_loc[i*bas_n1+j] 
	       <<= msh->vertex[dofs1[j]*vx_w+MCT2VXSTRT +i];
	   }
       

       /* set elResv to zero */
       for (i=0; i<dim*bas_n2; i++)
	{
	  elResv[i]=0.0;
	}
       /* set elResp to zero */
       for (i=0; i<bas_n1; i++)
	{
	  elResp[i]=0.0;
	}

       /* compute the Jacobian of the linear isoparmetric element
	 mapping */
       /* Jac=0 */
       for (i=0; i<dim*dim; i++)
	Jac[i]=0.0;
	
       /* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
       for (i=0;i<bas_n1; i++)
	{
	  for (j=0; j<dim; j++)
	    {
	      for (r=0; r<dim; r++)
		{
		  Jac[j*dim+r]+=
		    avertex_loc[j*bas_n1+i]
		    * gradp1[0*bas_n1*dim +i*dim +r];
		}
	    }
	}

       /* get detJac */
       detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	
       /* get Jacinv (here direct) */
       Jacinv[0]=1.0/detJac*Jac[3];
       Jacinv[1]=-1.0/detJac*Jac[1];
       Jacinv[2]=-1.0/detJac*Jac[2];
       Jacinv[3]=1.0/detJac*Jac[0];

       /* Jinvgrad1= Jacinv * gradphi1[k,:,:]
	 (=real world gradient T1 = const on each element )
       */
       for (i=0; i<dim; i++)
	{
	  for (j=0; j<bas_n1; j++)
	    {
	      Jinvgrad1[j*dim+i]=0.0;
	      for (l=0; l<dim; l++)
		{
		  Jinvgrad1[j*dim+i]+= Jacinv[l*dim+i]
		    * gradp1[0*bas_n1*dim+j*dim+l];
		}
	    }
	}


       /* loop over all integration points */
       for (k=0; k<iform.num_points; k++)
	{
	  /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
	     (=real world gradient T2 = linear function on each element)
	  */
	  for (i=0; i<dim; i++)
	    {
	      for (j=0; j<bas_n2; j++)
		{
		  Jinvgrad2[j*dim+i]=0.0;
		  for (l=0; l<dim; l++)
		    {
		      Jinvgrad2[j*dim+i]+= Jacinv[l*dim+i]
			* gradp2[k*bas_n2*dim+j*dim+l];
		    }
		}
	    }


	  /* pressure at the integration point: */
	  elp=0.0;
	  for(s1=0; s1<bas_n1; s1++)
	    elp+=(*sol).V[elpdofs[s1]]
	      * phi1[k*bas_n1+s1];


	  /* velocity at the integration point: */
	  for (m=0; m<dim; m++)
	    {
	      elu[m]=0.0;
	      for(s1=0; s1<bas_n2; s1++)
		elu[m]+=(*sol).V[dofs2[m*bas_n2+s1]]
		  * phi2[k*bas_n2+s1];
	    }


	  /* velocity gradient at the integration point: */
	  for (i=0; i<dim; i++)
	    for (m=0; m<dim; m++)
	      {
		elgu[i*dim+m]=0.0;
		for(s1=0; s1<bas_n2; s1++)
		  elgu[i*dim+m]+= Jinvgrad2[s1*dim+m]
		    *(*sol).V[dofs2[i*bas_n2+s1]];
	      }


	  /* now all the incredients are ready, compute the
	     local residual */

	  /* velocity components */
	  for (i=0; i<dim; i++)
	    for (j=0; j<bas_n2; j++)
	      {
		/* the a(u,v[i,j]) (viscous) part */
		/* nu*weight*|det(Jac)| of this integration point */
		weight=nu*iform.weights[k] * fabs(detJac);
		for (s1=0; s1<dim; s1++)
		  {
		    elResv[i*bas_n2+j] +=
		      weight *(+elgu[i*dim+s1]
			       * Jinvgrad2[j*dim+s1] );
		  }

		/* the c(u,u,v[i,j]) (trilinear) part */
		weight=iform.weights[k] * fabs(detJac);
		for (s1=0; s1<dim; s1++)
		  {
		    elResv[i*bas_n2+j] +=
		      weight * phi2[k*bas_n2+j] * elu[s1] *
		      elgu[i*dim+s1];
		  }

		/* the b(v[i,j],p) (pressure) part */
		/* pressure*weight of this integration point */
		elResv[i*bas_n2+j] -=
		  Jinvgrad2[j*dim+i]*elp*iform.weights[k]* fabs(detJac);
	      }

	  /* pressure components */
	  for (j=0; j<bas_n1; j++)
	    {
	      /* the b(u,q[j]) (divergence) part */
	      /* weight*q[j]*|det(Jac)| of this integration point */
	      weight=phi1[k*bas_n1+j]*iform.weights[k]* fabs(detJac);
	      for (s1=0; s1<dim; s1++)
		{
		  elResp[j] -= weight * elgu[s1*dim+s1];
		}
	    }


#warning navsto_Psi_tim_Res_t21: no rhs assembly yet!
	} /* end loop over all integration points */

       /***************************************************************
        * the element residual is ready, now compute the  local       *
        * contribution to Psi_tim_Res                                 *
        ***************************************************************/

       /* set Psi_tim_Res_loc to zero */
       aPsi_tim_Res_loc = 0.0;

       /* velocity-space residual */
       for (r=0; r<dim; r++) /* r-th dimension vel resisual */
	for (j=0; j<bas_n2; j++) /* j-th row of the vel res of this d */
	  {
	    aPsi_tim_Res_loc +=
	      Psi->V[r*vx_nr+dofs2[j]] * elResv[r*bas_n2 + j];
	    //printf("AD: Res(%2d)=%+10.7f\n",r*vx_nr+dofs2[j],elResv[r*bas_n2 + j].value());
	  }

       /* pressure-space residual */
       for (j=0; j<bas_n1; j++) /* j-th row of the p-res */
	{
	  aPsi_tim_Res_loc +=
	    Psi->V[elpdofs[j]] * elResp[j];
	}


       aPsi_tim_Res_loc >>= Psi_tim_Res_loc;
       trace_off();
       *Psi_tim_Res += Psi_tim_Res_loc;

       	if (d_Psi_tim_Res__ds!=NULL)
	  { /* need to evaluate the derivatives of the local
	       Psi*Res */
	    double *base_point;
	    double *grad_loc;

	    TRY_MALLOC(base_point, dim*bas_n1,
		       double, navsto_Psi_tim_Res_t21);
	    TRY_MALLOC(grad_loc,   dim*bas_n1,
		       double, navsto_Psi_tim_Res_t21);
		  
	    /* same order of input as above! */
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n1; j++)
		{
		  base_point[i*bas_n1+j] 
		    = msh->vertex[dofs1[j]*vx_w+MCT2VXSTRT +i];
		}
	    
	    gradient(2,dim*bas_n1,
		     base_point,grad_loc);
	    
	    /* derivatives in same order as input! */
	    for (i=0; i<dim; i++)
	      for (j=0; j<bas_n1; j++)
		{
		  (*d_Psi_tim_Res__ds)[dofs1[j]*vx_w+i] 
		    +=grad_loc[i*bas_n1+j];
		}
		      
	    free(grad_loc);
	    free(base_point);		    
	  } /* end AD local derivatives */


     } /* end loop over all elements */


   /* free local data */

   delete[] avertex_loc;

   free(elpdofs);
   free(dofs2);
   free(dofs1);

   delete[] elResp;
   delete[] elResv;

   delete[] elgu;
   free(elu);

   delete[] Jinvgrad2;
   delete[] Jinvgrad1;

   delete[] Jacinv;
   delete[] Jac;

   free_intdata (&iform);

   return SUCCESS;
}


/*FUNCTION*/
int navsto_deriv_compare_print(int i, int j, double A, char *name_A,
			       double B, char *name_B, double tol
/*  prints the integers i,j and the values of A and B and the
    relative difference (A-B)/A, if the relative difference is >=tol

    Input: i       - first integer to be printed
	   j       - second integer to be printed
	   A       - value of A
	   name_A  - name printed in front of A
	   B       - value of B
	   name_B  - name printed in front of B
	   tol     - tolerance, only if relative difference is greater
	             than tol the line will be printed

    Output: (printed)

    Return: SUCCESS - success
            FAIL    - failure, see error message, output will not be
                      valid
*/
			       ){
  double rel_diff=(A-B)/A;

  if ((fabs(rel_diff)>=tol)||(isinf(rel_diff))) //||isnan(rel_diff)))
    {
      /* printf("%3d %3d %s=%+9.7f %s=%+9.7f rel_diff=%+8.2e\n", 
	 i, j, name_A, A, name_B, B, rel_diff); /* */
      printf("%3d %3d %s=%+14.7e %s=%+14.7e rel_diff=%+8.2e\n", 
	     i, j, name_A, A, name_B, B, rel_diff);
    }

  return SUCCESS;
}
