/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2008, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/
/*
FILE navstoassem.c
HEADER navstoassem.h

TO_HEADER:


// useful text macros
#include "feins_macros.h"
// data structures 
#include "datastruc.h"
#include "sparse_struct.h"
#include "navsto_struct.h"

*/



/* function prototypes */
#include "navstoassem.h"

/* prototypes of external functions */
#include <math.h>
#include "elements.h"
#include "cubature.h"
#include "sparse.h"
#include "mesh.h"
#include "navsto_aux.h"
#include "feins_lapack.h"    /* link against LAPACK! */
#include "stokes_aux.h"
#include "assembly.h"
#include "stokesassem.h"
#include "lin_solver.h"

#define CBLOCKDENSITY (2*32)
#define ABLOCKDENSITY (12) 
/* #define CBLOCKDENSITY (2*26)
   #define ABLOCKDENSITY (9) */
#define KBLOCKDENSITY (2*(5*5)+1*(3*3))

/*FUNCTION*/
int navsto_C_assem_t21(struct navsto_matrix *K, struct vector *rhs,
		       double *maxPe, struct vector *rhs_stokes,
		       struct vector *u_old,
		       struct mesh *m,
		       FIDX lvl, int Fsonly, 
		       int lintype
/* performs the assembly of the changing part Fs of the stiffness
   matrix K and the right hand side vector rhs which result from
   linearised finite element discretisation of the Navier Stokes
   equation on the T2 mesh m, such that 

           K x = rhs

   defines the new iterate (approximate) solution x=(u_x, u_y, p) of
   the Navier Stokes equation

         -nu Laplace(u) + u * grad u + grad p = f
                                        div u = 0

   two different linearisations are implemented here, (see Gunzburger,
   Finite Element Methods for Viscous Incompressible Flows, Chapter
   6.1 and 6.3) Newtons Method and the Picard (/Simple) Iteration

   with boundary conditions as given in the mesh m    
   (p2 triangles)

   Input:  rhs_stokes
                   - the righthand side of the Stokes system, as
                     obtained by navsto_MG_init_assem_t21
	   u_old   - the previous iterate solution
	   m       - the mesh
	   lvl     - the level who's Fs is written
	   Fsonly  - to indicate if (Fsonly==1) only the matrix is to
	             be assembled, or (Fsonly==0) the righthand side
	             as well
	   lintype - the type of linearisation for which Fs and the
        	     rhs are wanted, lintype==1 ==> Newtons Method,
		     lintype==2 ==> Picard iteration
           
   In/Out: K       - stiffness matrix in specialised storage, the
                     structure K has to be initialised by
                     navsto_MG_init_assem_t21 once, and afterwards it is
                     just updated (the Fs part, which is the only part
                     that changes between iterations) in this routine

   Output: rhs     - righthand side vector, has to be initialised by
                     navsto_MG_init_assem_t21 once, and afterwards it
                     is just updated in this routine
	   maxPe   - max element Peclet number Pe (maxPe>1 means the
 	             FE discretisation on this mesh is unstable unless
 	             stabelised)


   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		     ){
  FIDX el, d, i, j, k, l, r, line, col;
  int  err;

  /*
	   stab    - to indicate wether stabilization shall be used
	             for the advection term or not, stab==0: use no
	             stabilisation, stab==1: use stabilisation
  */
  int stab;             /* determins whether SUPG terms are used */
  int artvis;           /* determins whether artificial diffussion is
			   used */ 

  FIDX dim, bas_n1, bas_n2, bas_n2_2, bas_n2_3, vx_nr, eg_nr, hi_nr,  
    el_w, vx_w, eg_w, fc_w, bd_w, hi_w, fu_w,
    bigN;
  FIDX subtypes[2];

  /* for FORTRAN calls: */
  char fTrans='T', fNoTrans='N';
  double dhelp, done=1.0, dzero=0.0;
  int fdim, fbas_n1, fbas_n2; 


  struct sparse *Fs;     /* pointer to this levels Advection Diffusion
			    stiffness matrix */
  struct int_data iform; /* integration formula 2d   */
  double *Uk;            /* velocity vector at the integration point */
  double *elA;           /* element viscous term a(.,.) */
  double *elCm;          /* element trilinear term c(.,.,.) */
  double *elCp;          /* element pressure space advection term */
  double *addC;          /* bas_n2*bas_n2 block to be added to a part of
			    Cw resp. Cu */
  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  double *Jinvgrad1;     /* inverse Jacobian times gradphi1 */
  double *Jinvgrad2;
#if (STABTYPE == 6)
  double *Lphiloc2;      /* Lphiloc2[i] local Hessian of basis function
			    phi2[i]  */
  double *elSGstab;      /* element subgrid stabilisation terms */
#endif

  double detJac;         /* determinant of the Jacobian */
  FIDX   *dofs1, *dofs2; /* degrees of freedom to which addC
			    corresponds */
  FIDX   *elpdofs;       /* element presure degrees of freedom  */

  double *phi1, *gradp1;
  double *phi2, *gradp2, *hphi2;
  double nu, el_nu, ud, weight;

  double h, uc[2], dui_dxj[2*2], abs_uc, heg, h_max, huc, hucmax, pe, beta,
    wrglob, norm_guc;
                         /* values used to determine the mesh peclet
			    number and the stabilization term */ 
  double xc[2];          /* the center point of the element */
  double *phic, *gphic, *hphic;
                         /* the basis function values at this point */

  FIDX count_Pe_stab=0;
  FIDX count_Ga_stab=0;
  double max_Pe_nu=0.0;
  double max_Ga_nu=0.0;

#ifdef TIMETERM
  const double rec_tau=1.0/1.0e+1; /* the reciprocal of the time step */
  double *elM;           /* element time term 1/tau(.,.) */
#endif


  /****************   init ******************************************/
  dim   = (*m).dim;
  vx_nr = (*m).vx_nr;
  eg_nr = (*m).eg_nr;
  hi_nr = (*m).hi_nr;
  vx_w  = (*m).vx_w;
  el_w  = (*m).el_w;
  eg_w  = (*m).eg_w;
  fc_w  = (*m).fc_w;
  bd_w  = (*m).bd_w;
  hi_w  = (*m).hi_w;
  fu_w  = (*m).fu_w;

  nu= (*m).para[MC2XPANUPO];

  /* get integration formula */
  subtypes[0]=1;
  subtypes[1]=2;

  stab=0;
  artvis=0;

  /* the coarse grids of multigrid use linear elements (on the
     velocity space) */
  if (Fsonly==1)
    {
#if ((STABTYPE==0)||(STABTYPE==1))
      /* coarse meshes 2nd order, no stabilisation term */
      stab=0;
      artvis=0;
#elif (STABTYPE==5)
      /* coarse meshes 2nd order, use (local) artificial viscosity */
      stab=0;       /* do not use SUPG terms */
      artvis=1;     /* do use artificial viscosity */
#elif ((STABTYPE==2)||(STABTYPE==6))
      /* coarse meshes 2nd order, but use stabilisation term */
      stab=1;
      artvis=0;
#elif ((STABTYPE==3)||(STABTYPE==4))
      /* coarse meshes 1st order (submesh of 2nd order), use
	 stabilisation term */
      FIDX *nodes, lvx_nr;

      stab=1;
      artvis=0;

      subtypes[1]=1;

      
      /* find the number of pnodes */
      TRY_MALLOC( nodes, vx_nr, FIDX, navsto_C_assem_t21);
      /* mark all as non-linear */
      for (i=0; i<vx_nr; i++) nodes[i]=0; 
      /* run over all edges, mark the linear nodes */
      for (i=0; i<eg_nr; i++)
	{ 
	  FIDX node1, node2;
	  node1= (*m).edge[i*eg_w+MCT2EGNOD1  ];
	  node2= (*m).edge[i*eg_w+MCT2EGNOD1+1];
	  nodes[node1]=1;
	  nodes[node2]=1;
	}
      /* count them */
      lvx_nr=0;
      for (i=0; i<vx_nr; i++) lvx_nr+=nodes[i]; 

      /* check that no nodes with number>lvx_nr higher nodes are
	 linear nodes */
      for (i=lvx_nr; i<vx_nr; i++) 
	if (nodes[i]!=0) 
	  {
	    fprintf(stderr, "navsto_C_assem_t21: node numbering "
		    "not allowable for this implementation of\n"
		    "the coarse grid stabilization\n");
	    return FAIL;
	  }

      free(nodes);
      vx_nr=lvx_nr;

#if (STABTYPE==4)
      /* use only advection diffusion on coarse meshes */
      lintype=2;
#endif
#else
#error "STABTYPE unknown"
#endif


    }

  err=cubature_bases( dim, 7, tria, 2, subtypes, &iform); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_C_assem_t21);

  /* make phi2 and gradphi2 better accessible */
  phi1    = (iform.bases[0]->phi);
  gradp1  = (iform.bases[0]->gradphi);
  bas_n1  = (iform.bases[0]->num_basis);
  phi2    = (iform.bases[1]->phi);
  gradp2  = (iform.bases[1]->gradphi);
  hphi2   = (iform.bases[1]->hessphi);
  bas_n2  = (iform.bases[1]->num_basis);

  fbas_n1 = (int) bas_n1;
  fbas_n2 = (int) bas_n2;
  fdim    = (int) dim;

  xc[0]=1.0/3.0;  /* the center of gravity of the master element is
		     (1/3,1/3) */
  xc[1]=xc[0];
  /* evaluate the basis functions for this point */
  err=eval_basis( dim, tria, subtypes[1], 1, xc, &i, &phic, &gphic, &hphic);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_C_assem_t21);
  /* free unneeded info */
  free(hphic);


  if (dim > 2)
    {
      /* cry */
      fprintf(stderr,
	      "navsto_C_assem_t21: dim >2 not implemented (Jacinv)\n");
      return FAIL;
    }

  if ((lintype!=1)&&(lintype!=2))
    {
      /* cry */
      fprintf(stderr,
	      "navsto_C_assem_t21: unknown lintype=%d\n", lintype);
      return FAIL;
    }

  if ( (*K).Fs[lvl].row_nr!=dim*vx_nr )
    {
      /* cry */
      fprintf(stderr,
	      "navsto_C_assem_t21: size of Fs wrong???\n"
	      "(*K).Fs[lvl].row_nr=%d    dim*vx_nr=%d\n",
	      (*K).Fs[lvl].row_nr, dim*vx_nr );
      return FAIL; 
    }
    

  bas_n2_2=bas_n2*bas_n2;    /* bas_n2^2 */
  bas_n2_3=bas_n2*bas_n2_2;  /* bas_n2^3 */

  /* allocate memory for local data */
  TRY_MALLOC( Uk, dim, double, navsto_C_assem_t21);
  TRY_MALLOC( elA, bas_n2_2, double, navsto_C_assem_t21);
  TRY_MALLOC( elCm, dim*bas_n2_3, double, navsto_C_assem_t21);
  TRY_MALLOC( addC, bas_n2_2, double, navsto_C_assem_t21);
  TRY_MALLOC( elCp, bas_n1*bas_n1, double, navsto_C_assem_t21);

  TRY_MALLOC( Jac, dim*dim, double, navsto_C_assem_t21);
  TRY_MALLOC( Jacinv, dim*dim, double, navsto_C_assem_t21);

  TRY_MALLOC( Jinvgrad1, dim*bas_n1, double, navsto_C_assem_t21);
  TRY_MALLOC( Jinvgrad2, dim*bas_n2, double, navsto_C_assem_t21);
#if (STABTYPE == 6)
  TRY_MALLOC( Lphiloc2, bas_n2, double, navsto_C_assem_t21);
  TRY_MALLOC( elSGstab, dim*dim*bas_n2_2, double, navsto_C_assem_t21);
#endif

  TRY_MALLOC( dofs1, bas_n2, FIDX, navsto_C_assem_t21);
  TRY_MALLOC( dofs2, bas_n2, FIDX, navsto_C_assem_t21);
  TRY_MALLOC( elpdofs, bas_n1, FIDX, navsto_C_assem_t21);

#ifdef TIMETERM
  TRY_MALLOC( elM, bas_n2_2, double, navsto_C_assem_t21);
#endif

  /* set Fs as pointer to the right level's Fs */
  Fs=&(*K).Fs[lvl];

  /* clear Fs */
  sparse_empty(Fs);

  if (Fsonly!=1)
    {
      /* clear Cp */
      sparse_empty(&(*K).Cp);

      bigN = dim*vx_nr + (*K).pvx_nr;
      /* initialise rhs */
      for (i=0; i<bigN; i++)
	{
	  (*rhs).V[i]=(*rhs_stokes).V[i];
	}
    }

  *maxPe=0.0;
  wrglob=2.0;


  /* loop over all elements */
  for (el=0; el<(*m).el_nr; el++)
    {

      /* compute the Jacobian for this element, 
         we only allow linear triangles */
      /* Jac=0 */
      for (i=0; i<dim*dim; i++)
	Jac[i]=0.0;
	  
      /* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
      for (i=0;i<bas_n1; i++)
	{
	  for (j=0; j<dim; j++)
	    {
	      for (r=0; r<dim; r++)
		{
		  Jac[j*dim+r]+= 
		    (*m).vertex[(*m).elem[el*el_w+MCT2ELNOD1+i]*vx_w
				+MCT2VXSTRT+j]
		    * gradp1[0*bas_n1*dim +i*dim +r];
		}
	    }
	}

      /* get detJac */
      detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
      /* get Jacinv (here direct) */
      Jacinv[0]=1.0/detJac*Jac[3];
      Jacinv[1]=-1.0/detJac*Jac[1];
      Jacinv[2]=-1.0/detJac*Jac[2];
      Jacinv[3]=1.0/detJac*Jac[0];


      /* Jinvgrad1= Jacinv * gradphi1[k,:,:] 
	 (=real world gradient T1)
      */
      dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n1, &fdim,
	      &done, Jacinv, &fdim, &(gradp1[0*bas_n1*dim]), &fdim,
	      &dzero, Jinvgrad1, &fdim );


      /* if one of the stabelisation methods is to be used */
      /* if ((stab==1)||(artvis==1)) /* */
	{
	  /* determine the mesh peclet number for this element */
	  /* first get uc, the velocity vector at the center of the
	     element */
	  for (j=0; j<dim; j++)
	    {
	      uc[j]=0.0;
	      for (i=0; i<bas_n2; i++)
		uc[j]+=phic[i]
		  *(*u_old).V[(*m).elem[el*el_w+MCT2ELNOD1+i]+j*vx_nr];
	    }
	  abs_uc=0;
	  for (j=0; j<dim; j++)
	    abs_uc+=uc[j]*uc[j];
	  abs_uc=sqrt(abs_uc);

	  /* get norm(guc), the maximum norm of the Jacobian of the
	     velocity vector at the center of the element */
	  norm_guc=0.0;
	  for (j=0; j<dim; j++)
	    for (r=0; r<dim; r++)
	      {
		double guc_jr=0.0;

		for (i=0; i<bas_n1; i++)
		  guc_jr += Jinvgrad1[i*dim+r]
		    *(*u_old).V[(*m).elem[el*el_w+MCT2ELNOD1+i]+j*vx_nr];
		
		norm_guc=fmax(fabs(guc_jr),norm_guc);
	      }

	  /* now get the length of the edge which coincides best with the
	     direction of uc */
	  hucmax=-1.0;
	  h_max=-1.0;
	  for (i=0; i<bas_n1; i++)
	    for (j=i+1; j<bas_n1; j++)
	      {
		FIDX nodei, nodej;
		nodei=(*m).elem[el*el_w+MCT2ELNOD1+i];
		nodej=(*m).elem[el*el_w+MCT2ELNOD1+j];
		huc=0.0;
		for(d=0; d<dim; d++)
		  huc+=uc[d]*( (*m).vertex[nodei*vx_w+MCT2VXSTRT+d]
			       -(*m).vertex[nodej*vx_w+MCT2VXSTRT+d] );
		
		heg=0.0; /* h_edge */
		for(d=0; d<dim; d++)
		  heg+=( (*m).vertex[nodei*vx_w+MCT2VXSTRT+d]
			 -(*m).vertex[nodej*vx_w+MCT2VXSTRT+d] )
		    *( (*m).vertex[nodei*vx_w+MCT2VXSTRT+d]
		       -(*m).vertex[nodej*vx_w+MCT2VXSTRT+d] );
		heg=sqrt(heg);
		
		if (heg>h_max) h_max=heg; /* max(h_edge) */

		huc=fabs(huc);
		if (huc>hucmax)         /* max(h_edge) in direction of u */
		  {
		    hucmax=huc;
		    h=heg;
		  }
	      }
	  /* this elements Peclet number Pe */
	  pe=0.5*h*abs_uc/nu;
	  /* update the maximum Pe */
	  if (*maxPe<pe) *maxPe=pe;
	  /* stabilization parameter beta */
	  if (abs_uc!=0.0)
	    {
#ifdef RAMAGE_NOT_TEZDUYAR
	      /* Ramage stab parameter */
	      beta=(0.5*h-nu)/abs_uc;
#else
	      /* Tezduyar stab parameter */
	      beta=(0.5*h/abs_uc)*fmin(1.0, 0.33333333*pe);
#endif


	    }
	  else
	    {
	      beta=0.0;
	    }
	  if ((beta<0.0)||(stab!=1)) beta=0.0;

	  el_nu=nu;
	  if (artvis==1)
	    {
	      double stab_nu;
	      double stab_alpha=0.125;

	      /* desired viscosity to get Pe<=2 */
	      stab_nu=stab_alpha*h*abs_uc;
	      if (el_nu<stab_nu)
		{
		  el_nu=stab_nu;
		  count_Pe_stab++;
		  max_Pe_nu=fmax(max_Pe_nu,stab_nu);
		}

	      /* desired viscosity to get Gamma<=1 */
	      stab_nu=stab_alpha*h_max*h_max*norm_guc;
	      if (el_nu<stab_nu)
		{
		  el_nu=stab_nu;
		  count_Ga_stab++;
		  max_Ga_nu=fmax(max_Ga_nu,stab_nu);
		}
	    }
	}
      /* else
	 {
	 beta=0.0;
	 el_nu=nu;
	 }/* */

      /* set elA to zero */
      for (i=0; i<bas_n2_2; i++) 
	{
	  elA[i]=0.0;
	}
      /* set elCm to zero */
      for (i=0; i<dim*bas_n2_3; i++) 
	{
	  elCm[i]=0.0;
	}

      /* set elCp to zero */
      for (i=0; i<bas_n1*bas_n1; i++) 
	{
	  elCp[i]=0.0;
	}


#if (STABTYPE==6)
      /* set elSGstab to zero */
      for (i=0; i<bas_n2_2*dim*dim; i++) 
	{
	  elSGstab[i]=0.0;
	}
#endif

#ifdef TIMETERM
      /* set elM to zero */
      for (i=0; i<bas_n2_2; i++) 
	{
	  elM[i]=0.0;
	}
#endif

      
      /* loop over all integration points */
      for (k=0; k<iform.num_points; k++)
	{
	  /* Jinvgrad2= Jacinv * gradphi2[k,:,:]
	     (=real world gradient T2)
	  */
	  dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n2, &fdim,
		  &done, Jacinv, &fdim, &(gradp2[k*bas_n2*dim]), &fdim,
		  &dzero, Jinvgrad2, &fdim );

	  /* compute du_i/dx_j */
	  for (i=0; i<dim; i++)
	    for (j=0; j<dim; j++)
	      {
		dui_dxj[j*dim+i]=0.0;
		for (l=0; l<bas_n2; l++)
		  {
		    dui_dxj[j*dim+i]+=Jinvgrad2[l*dim+j]
		    *(*u_old).V[(*m).elem[el*el_w+MCT2ELNOD1+l]+i*vx_nr];
		}
	    }

#if (STABTYPE == 6)
	  /* Lphiloc2[i] = sum_d=dim (d phi_i/d x_d)
                = sum_d ( sum_j 
		   ( sum_l hessphi2[k,i,j,l] * Jacinv[l,d] )
		    * Jacinv[j,d]  
		    + some term only important if shapefunctions not linear
		        ) */
	  for (i=0; i<bas_n2; i++)
	    {
	      Lphiloc2[i]=0.0;
	      for (d=0; d<dim; d++)
		{
		  double d2_dxd2;
		  d2_dxd2 = 0.0;
		  for (j=0; j<dim; j++)
		    {
		      double djpart;
		      djpart=0.0;
		      for (l=0; l<dim; l++)
			{
			  djpart += hphi2[k*bas_n2*dim*dim +i*dim*dim 
					  +j*dim +l] * Jacinv[l*dim+d];
			}
		      d2_dxd2 += djpart*Jacinv[j*dim+d];
		    }
		  Lphiloc2[i] += d2_dxd2;
		}
	    }
#endif


	  /* the real wolrd weight of this integration point */
	  weight=fabs(detJac) * iform.weights[k];

	  dhelp=weight*el_nu;
	  /* elA += |detJac|*weigth[k] * nu* Jinvgrad2^T*Jinvgrad2 */
	  dgemm_( &fTrans, &fNoTrans, &fbas_n2, &fbas_n2, &fdim,
		  &dhelp, Jinvgrad2, &fdim, Jinvgrad2, &fdim,
		  &done, elA, &fbas_n2 );


	  /* elCp[i,j]= 1/nu int(element) (sum_d sum_l
	     (u_ld*phi2_l * (d phi1_j/d x_d) * phi1_i))

	     the integration for each entry of elCp :

	     defines Uk[] which is used later
	  */
	  for (d=0; d<dim; d++)
	    {
	      ud=0.0;
	      for (l=0; l<bas_n2; l++)
		{
		  ud += phi2[k*bas_n2+l]
		    *(*u_old).V[(*m).elem[el*el_w+MCT2ELNOD1+l]+d*vx_nr];
		}
	      Uk[d]=ud;
	      ud *= weight/nu;
	      for (j=0; j<bas_n1; j++)
		for (i=0; i<bas_n1; i++)
		  {
		    elCp[j*bas_n1+i] +=
		      ud*Jinvgrad1[j*dim+d]*phi1[k*bas_n1+i];
		  }

	      /* the reaction part of Fp */
	      ud = weight/(dim*nu); /* 1/dim*sum(react terms) */
	      /* ud = weight/nu; /* sum(react terms) */
	      for (j=0; j<bas_n1; j++)
		for (i=0; i<bas_n1; i++)
		  {
		    elCp[j*bas_n1+i] +=
		      (dui_dxj[d*dim+d]
#ifdef TIMETERM
		       +rec_tau
#endif
		       )*phi1[k*bas_n1+j]*phi1[k*bas_n1+i]*ud;
		  } /* */
	    }


#ifdef WAVE_RESOLVE_TEST
#if (STABTYPE==6)
#error "overwriting dui_dxj[0] is incompatible with STABTYPE=6"
#endif
	  /* find the min dui_dxj[*], put it into dui_dxj[0] */
	  for (i=1; i<dim; i++)
	    if (dui_dxj[i*dim+i]<dui_dxj[0]) dui_dxj[0]=dui_dxj[i*dim+i];
	  /* check if we have negative reaction coefficient */
	  if (dui_dxj[0]<0.0)
	    {
	      /* if so, check if we are in oscillating regime */
	      if (abs_uc*abs_uc + 4*nu*dui_dxj[0]<0.0)
		{
		  /* if so, check the wave resolution (wr):
		     lambda = local wave length (derived from 1d equation)
		     wr= lambda/4h,
		     thus, if wr>1 the wavelength should be
		     sufficiently resolvable by the mesh (i.e. with 4 points)
		  */
		  double lambda, wrloc;
		  lambda= 2*M_PI/
		    sqrt(-(0.25*abs_uc*abs_uc/nu + dui_dxj[0])/nu);
		  wrloc = lambda/(4*h_max);

		  /* update the global wave resolution */
		  if (wrloc<wrglob) 
		    {
		      wrglob=wrloc;
		      printf("abs_uc=%e  nu=%e   dui_dxj[0]=%e  lambda=%e\n",
			     abs_uc, nu, dui_dxj[0], lambda);
		    }
		  
		}
	    }
#endif	    


	  /* elCm is a 4 dimensional array, 
	     elCm[d,i,j,l]=elCm[d*bas_n2^3+i*bas_n2^2+j*bas_n2+l]
	                 = int(element)(phi2_l * (d phi2_j/d x_d) * phi2_i)

	     the integration for each entry of elCm :
	  */
	  for (d=0; d<dim; d++)
	    for (i=0; i<bas_n2; i++)
	      for (j=0; j<bas_n2; j++)
		for (l=0; l<bas_n2; l++)
		  {
		    elCm[d*bas_n2_3+i*bas_n2_2+j*bas_n2+l] +=
		      phi2[k*bas_n2+l]*Jinvgrad2[j*dim+d]*phi2[k*bas_n2+i]
		      * weight ;
#if (( (STABTYPE==2) )||( (STABTYPE==3)||(STABTYPE==4) ))
		    /* use SUPG stabilisation, test function
		       phi2[.. +i] is replaced by
		       phi2[.. +i] + beta* (U*gradphi2[i]),
		       here we add the later term */
		    for (r=0; r<dim; r++)
		      {
			elCm[d*bas_n2_3+i*bas_n2_2+j*bas_n2+l] +=
			  phi2[k*bas_n2+l]*Jinvgrad2[j*dim+d]
			  *beta*Uk[r]*Jinvgrad2[i*dim+r] * weight ;
		      }
#endif
		  }

#if (STABTYPE==6)
	  /* use full SUPG stabilisation, add stabilisation term
	     int(beta*(-nu*Lap(u)+w^T grad(u)+c u )
	         *(nu*Lap(v)+w^T grad(v)-c v) ) 

	     w=Uk[.] 
	     c=dui_dxj[j*dim+i] 
	  */
	  for (r=0; r<dim; r++)    /* block row of matrix   (v_r) */
	    for (l=0; l<dim; l++)  /* block col of matrix   (u_l) */
	      for (i=0; i<bas_n2; i++)   /* row of block  (v_r)_i */
		for (j=0; j<bas_n2; j++) /* col of block  (u_l)_j */
		  {
    		    double wTgradU=0.0;
    		    double wTgradV=0.0;
		    double sumc=0.0; /* sum of reaction coefficients
					of this row */
		    double c_rl=0.0; /* reaction coefficient of this
					block */

		    double delta_rl=0.0;
		    if (l==r) delta_rl=1.0;

		    for (d=0; d<dim; d++) /* inner loop for w^T grad */
		      {
			wTgradU+=Uk[d]*Jinvgrad2[j*dim+d];
			wTgradV+=Uk[d]*Jinvgrad2[i*dim+d];
			sumc += dui_dxj[d*dim+r];
		      }
		    c_rl=dui_dxj[l*dim+r];

		    if (lintype==2)
		      {
			/* reaction terms not present, c=0 */
			sumc=0.0;
			c_rl=0.0;
		      }

		    elSGstab[l*dim*bas_n2_2+r*bas_n2_2+j*bas_n2+i] +=
		      beta*(delta_rl*(-nu*Lphiloc2[j]+wTgradU)
				      +c_rl*phi2[k*bas_n2+j])
		      *(-nu*Lphiloc2[i]+wTgradV-sumc*phi2[k*bas_n2+i])
		      * weight ;
		    /* elSGstab[l*dim*bas_n2_2+r*bas_n2_2+j*bas_n2+i] +=
		       beta*(delta_lr*wTgradU)
		       *(wTgradV) * weight ; /* only Stab2 */

		  }
#endif


#ifdef TIMETERM
	  for (j=0; j<bas_n2; j++)
	    for (i=0; i<bas_n2; i++)
	      {
		elM[j*bas_n2+i] +=
		  rec_tau*phi2[k*bas_n2+i]*phi2[k*bas_n2+j]
		  * weight ;
	      }
#endif

	}

      /***************************************************************
       * elA, elCm and elCp are ready, now first assemble Fs and Cp, *
       * then if needed add Cu and correct the rhs                   *
       ***************************************************************/

      /* Cw in its two identical diagonal parts */
      /* clear addC */
      for (i=0; i<bas_n2_2; i++)
	{ addC[i]=0.0; }
      
      /* build addC */
      for (i=0; i<bas_n2; i++)
	for (j=0; j<bas_n2; j++)
	  {
	    for (d=0; d<dim; d++)
	      for (l=0; l<bas_n2; l++)
		{
		  addC[j*bas_n2+i]+=
		    elCm[d*bas_n2_3+i*bas_n2_2+j*bas_n2+l]
		    *(*u_old).V[d*vx_nr+(*m).elem[el*el_w+MCT2ELNOD1+l]];

#ifdef CONSERV_ADV_NEWT
		  addC[j*bas_n2+i]+=
		    elCm[d*bas_n2_3+i*bas_n2_2+l*bas_n2+j]
		    *(*u_old).V[d*vx_nr+(*m).elem[el*el_w+MCT2ELNOD1+l]];
#endif
		}
	  }

      /* now add elA and this addC to Fs */
      for (line=0; line<dim; line++)
	{
	  /* the dofs to which this block corresponds */
	  for (i=0; i<bas_n2; i++)
	    { dofs1[i]=(*m).elem[el*el_w+MCT2ELNOD1+i]+line*vx_nr; }
	  
	  /* add elA to Fs */
	  err=sparse_add_local(Fs, NoTrans,
			       bas_n2, dofs1, bas_n2, dofs1,
			       elA, bas_n2 );
	  /* add addC (=Cw) to Fs */
	  err=sparse_add_local(Fs, NoTrans,
			       bas_n2, dofs1, bas_n2, dofs1,
			       addC, bas_n2 );
	  FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
				   navsto_C_assem_t21);

#ifdef TIMETERM
	  /* add elM to Fs */
	  err=sparse_add_local(Fs, NoTrans,
			       bas_n2, dofs1, bas_n2, dofs1,
			       elM, bas_n2 );

	  if (Fsonly!=1)
	    {
	      /* add elM*w to rhs */
	      for (i=0; i<bas_n2; i++)
		for (j=0; j<bas_n2; j++)
		  {
		    (*rhs).V[dofs1[i]]+= 
		      elM[j*bas_n2+i]*(*u_old).V[dofs1[j]];
		  }
	    }
#endif
	}

      if (Fsonly!=1)
	{
	  /* assemble Cp */
	  if ((lintype==2)||(lintype==1))
	    {
	      for (i=0; i<bas_n1; i++)
		{
		  elpdofs[i]=(*K).pdof[(*m).elem[el*el_w+MCT2ELNOD1+i]];
		}

	      err=sparse_add_local(&(*K).Cp, NoTrans,
				   bas_n1, elpdofs, bas_n1, elpdofs,
				   elCp, bas_n1 );
	      FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
				       navsto_C_assem_t21);
	    }
	}


      /* if needed add Cu and rhs */
      if (lintype==1)
	{
	  for (line=0; line<dim; line++)
	    {
	      /* the dofs to which this block line corresponds */
	      for (i=0; i<bas_n2; i++)
		{ dofs1[i]=(*m).elem[el*el_w+MCT2ELNOD1+i]+line*vx_nr; }
	      
	      for (col=0; col<dim; col++)
		{
		  /* the dofs to which this block column corresponds */
		  for (i=0; i<bas_n2; i++)
		    { dofs2[i]=(*m).elem[el*el_w+MCT2ELNOD1+i]+col*vx_nr; }
		  
		  /* clear addC */
		  for (i=0; i<bas_n2_2; i++)
		    { addC[i]=0.0; }
		  
		  /* build addC */
		  d=col;
		  for (i=0; i<bas_n2; i++)
		    for (j=0; j<bas_n2; j++)
		      for (l=0; l<bas_n2; l++)
			{
			  addC[j*bas_n2+i]+=
			    elCm[d*bas_n2_3+i*bas_n2_2+l*bas_n2+j]
			    *(*u_old).V[dofs1[l]];

#ifdef CONSERV_ADV_NEWT
			  addC[j*bas_n2+i]+=
			    elCm[d*bas_n2_3+i*bas_n2_2+j*bas_n2+l]
			    *(*u_old).V[dofs1[l]];
#endif
			}


#ifdef DIAG_ONLY_COARSE
		  if ((Fsonly!=1)||(col==line))
		    {
		      /* add addC (=Cu) to Fs */
		      err=sparse_add_local(Fs, NoTrans,
					   bas_n2, dofs1, bas_n2, dofs2,
					   addC, bas_n2 );
		      FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
					       navsto_C_assem_t21);
		    }
#else		  
		  /* add addC (=Cu) to Fs */
		  err=sparse_add_local(Fs, NoTrans,
				       bas_n2, dofs1, bas_n2, dofs2,
				       addC, bas_n2 );
		  FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
					   navsto_C_assem_t21);
#endif
		  if (Fsonly!=1)
		    {
		      /* add Cu*w to rhs */
		      for (i=0; i<bas_n2; i++)
			for (j=0; j<bas_n2; j++)
			  {
			    (*rhs).V[dofs1[i]]+= 
			      addC[j*bas_n2+i]*(*u_old).V[dofs2[j]];
			  }
		    }
		} /* end col */
	    } /* end line */
	} /* end if lintype==1 */


#if (STABTYPE==6)
  /* add the stabilastion terms to the stiffness matrix */
      for (line=0; line<dim; line++)
	{
	  /* the dofs to which this block line corresponds */
	  for (i=0; i<bas_n2; i++)
	    { dofs1[i]=(*m).elem[el*el_w+MCT2ELNOD1+i]+line*vx_nr; }
      
	  for (col=0; col<dim; col++)
	    {
	      /* the dofs to which this block column corresponds */
	      for (i=0; i<bas_n2; i++)
		{ dofs2[i]=(*m).elem[el*el_w+MCT2ELNOD1+i]+col*vx_nr; }
		  
	      /* add elSGstab to Fs */
	      err=sparse_add_local(Fs, NoTrans,
				   bas_n2, dofs1, bas_n2, dofs2,
				   &elSGstab[col*dim*bas_n2_2+line*bas_n2_2],
				   bas_n2 );
	      FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
				       navsto_C_assem_t21);

	      if ((line==0)&&(col==0)&&(el==1)&&(1==1))
		{
		  printf("elSGstab=\n");
		  for (i=0; i<bas_n2; i++)
		    {
		      for (j=0; j<bas_n2; j++)
			printf(" %+9.2e  ", elSGstab[col*dim*bas_n2_2
						     +line*bas_n2_2
						     +j*bas_n2+i]);
		      printf("\n");
		    }
		}
	    } /* end col */
	} /* end line */
#endif

    } /* end loop over all elements */


  /* if needed correct the rhs (project it for the boundary conditions) */
  if (lintype==1)
    {
      if (Fsonly!=1)
	{
	  /* project the righthand side */
	  for (i=0; i<(*K).bn_nr; i++)
	    {
	      for (j=0; j<dim; j++)
		{
		  (*rhs).V[(*K).nodes[i]+j*vx_nr]=0.0;
		}
	    }
	}
    }

  /* printf("lvl=%d,  maxPe=%8.1e\n", (int) lvl, *maxPe); /* */

  if ( (*Fs).type == SP_TYPE_FLEX )
    {
      err = sparse_convert_compressed_row( Fs );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       navsto_C_assem_t21 );
    }

  if (Fsonly!=1)
    {
      if ( (*K).Cp.type == SP_TYPE_FLEX )
	{
	  err = sparse_convert_compressed_row( &(*K).Cp );
	  FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
				   navsto_C_assem_t21 );
	}
    }

  /* free local data */
  free_intdata (&iform);
  free(phic);
  free(gphic);
  free(Uk);
  free(elA);
  free(elCm);
  free(elCp);
  free(addC);
  free(Jac);
  free(Jacinv);
  free(Jinvgrad1);
  free(Jinvgrad2);
  free(dofs1);
  free(dofs2);
  free(elpdofs);

#ifdef TIMETERM
  free(elM);
#endif
  
#if (STABTYPE==6)
  free(Lphiloc2);
  free(elSGstab);
#endif


#ifdef WAVE_RESOLVE_TEST
  printf("wrglob=%e\n", wrglob);
#endif

  if (artvis==1)
    {
      printf("\n  c_Pe=%4d  c_Ga=%4d   c_no=%4d    m_nu_Pe=%8.1e"
	     "   m_nu_Ga=%8.1e ",
	     (int) count_Pe_stab, (int) count_Ga_stab,
	     (int)  ( (*m).el_nr-count_Pe_stab-count_Ga_stab),
	     max_Pe_nu, max_Ga_nu);
    }
  else printf("\n");

  return SUCCESS;
}
  
  
/*FUNCTION*/
int navsto_MG_init_assem_t21(struct mesh *m1, FIDX lvl,
			     struct navsto_matrix *K, 
			     struct vector *rhs_stokes,
			     struct vector *rhs,
			     struct vector *u0,
			     struct mesh *m2,
			     double *vx_new, FIDX vx_max
/* performs the assembly of the non-changing parts of the stiffness
   matrix K and the right hand side vector rhs which result from
   linearised finite element discretisation of the Navier Stokes
   equation on the T2 mesh m2 (build from T1 mesh m1 by refinement),
   such that 

           K x = rhs

   defines the new iterate (approximate) solution x=(u_x, u_y, p) of
   the Navier Stokes equation

         -nu Laplace(u) + u * grad u + grad p = f
                                        div u = 0

   with boundary conditions as given in the mesh m    
   (p2 triangles)

   The linear terms are identical to the discretisation of a
   Stokes system, so mainly the Stokes assembly is called
   here. However, initialisation of the remaining parts is performed
   here as well. 

   Input:  m1      - the coarse mesh (T1)
           lvl     - (new) current level
	   vx_new  - overriding set of node positions for T2 meshes
                     ignored if ==NULL
	   vx_max  - maximum number of nodes for use of vx_max
                     ignored if vc_new==NULL


   Output: K       - stiffness matrix in specialised storage, the
                     structure K is initialised here, memory
                     allocated and during successive iterations
                     navsto_C_assem_MG_t21 has to be used to update
                     the changing parts of the system
           rhs_stokes
                   - the righthand side of the Stokes system, only the
                     vector data structure needs to be given, it is
		     initialised here
           rhs     - the righthand side of the linearised Navier
                     Stokes system, only the vector data structure
                     needs to be given, it is initialised here, on
                     exit it will be identical to rhs_stokes
	   u0      - a initial guess for the solution which fulfils
         	     the boundary conditions
	   m2      - the refined T2 mesh



   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		       ){
  FIDX i, tl;
  int  err;
  double *vx_tmp=NULL;

  /* protect vx_new from being freed */
  if (vx_new== m2->vertex) m2->vertex=NULL;

  mesh_free( m2);

  /* restart with coarse t2 mesh */
  err=mesh_t1_to_t2(m1, m2);
  FUNCTION_FAILURE_HANDLE( err, mesh_t1_to_t2, navsto_MG_init_assem_t21);
  
  /* override node positions */
  if (vx_new != NULL)
    {
      vx_tmp     = m2->vertex;
      if (m2->vx_nr <= vx_max)
	m2->vertex = vx_new;
      else
	{ 
	  fprintf(stderr, "navsto_MG_init_assem_t21: "
		  "vx_max insufficient!\n");
	  return FAIL;
	}
    }
  
  for (tl=0; tl<=lvl; tl++)
    {
#if (((STABTYPE==0)||(STABTYPE==1))||((STABTYPE==2)||(STABTYPE==5)) \
      ||(STABTYPE==6))
      if(tl>0)   /* coarse meshes 2nd order */
#elif ((STABTYPE==3)||(STABTYPE==4))
      if(tl<lvl) /* coarse meshes 1st order (submesh of 2nd order) */
#else
#error "STABTYPE unknown"
#endif
	{

	  /* protect vx_new */
	  if (vx_new != NULL) m2->vertex = vx_tmp;

	  err=mesh_refine_uniform_t2(m2);
	  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2,
				   navsto_MG_init_assem_t21);

	  /* override node positions */
	  if (vx_new != NULL)
	    {
	      vx_tmp     = m2->vertex;
	      if (m2->vx_nr <= vx_max)
		m2->vertex = vx_new;
	      else
		{ 
		  fprintf(stderr, "navsto_MG_init_assem_t21: "
			  "vx_max insufficient!\n");
		  return FAIL;
		}
	    }
	}

      if ( (*K).Fs[tl].row_nr==0)
	{
	  err=sparse_alloc( &(*K).Fs[tl], (*m2).dim * (*m2).vx_nr,
			    CBLOCKDENSITY );
	  FUNCTION_FAILURE_HANDLE( err, sparse_alloc,
				   navsto_MG_init_assem_t21);
	}
    }
  err=stokes_assem_t21( K, rhs_stokes, u0, m2, lvl, 1);
  FUNCTION_FAILURE_HANDLE( err, assem_stokes_t21,
			   navsto_MG_init_assem_t21);

  err=vector_alloc( rhs, (*rhs_stokes).len);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_MG_init_assem_t21);
  for (i=0; i<(*rhs).len; i++)
    { (*rhs).V[i]=(*rhs_stokes).V[i]; }


  err=sparse_alloc(&(*K).Ap, (*K).pvx_nr, ABLOCKDENSITY );
  FUNCTION_FAILURE_HANDLE( err, sparse_alloc, navsto_MG_init_assem_t21);

  err=sparse_alloc(&(*K).Cp, (*K).pvx_nr, ABLOCKDENSITY );
  FUNCTION_FAILURE_HANDLE( err, sparse_alloc, navsto_MG_init_assem_t21);

  err=assem_poison_t1_t2(&(*K).Ap, (*K).pvx_nr, (*K).pdof, m2);
  FUNCTION_FAILURE_HANDLE( err, assem_poison_t1_t2,
			   navsto_MG_init_assem_t21);

#ifdef LAPLACIAN_COARSE_GRID_SOLVER
  /* if lvl==0 initialise coarse grid solver for the Laplacian */
  printf("lvl=%d\n", (int) lvl);
  if (lvl==0)
    {
      FIDX p_bn_nr;
      /* alloc space for cmat_Ap and cmat_Ap_pdof */
      TRY_MALLOC((*K).cmat_Ap, 1, struct coarse_mat,
		 navsto_MG_init_assem_t21);
      TRY_MALLOC((*K).cmat_Ap_pdof, (*K).vx_nr, FIDX,
		 navsto_MG_init_assem_t21);

      /* due to lack of appropriate boundary conditions, 
	 define the value at one point to be zero */
      (*K).cmat_Ap_pdof[0]=0;
      p_bn_nr=1;

      err= coarse_mat_set( &(*K).Ap, p_bn_nr, (*K).cmat_Ap_pdof,
			   1, (*K).cmat_Ap );
      FUNCTION_FAILURE_HANDLE( err, coarse_mat_set,
			       navsto_MG_init_assem_t21);
      
      /* now copy the correct pdof into cmat_Ap_pdof and set the
	 according numbers */
      for (i=0; i< K->vx_nr; i++) 
	{
	  K->cmat_Ap_pdof[i]= K->pdof[i];
	}
      K->cmat_Ap_vxnr= K->vx_nr;
      K->cmat_Ap_pvxnr= K->pvx_nr;
      /* done */
    }
#endif   /* LAPLACIAN_COARSE_GRID_SOLVER */

  if ((*K).mld==NULL) 
    {
      TRY_MALLOC( (*K).mld, 1, struct multilvl, 
		  navsto_MG_init_assem_t21);
    }
  if ((*K).mg==NULL) 
    {
      TRY_MALLOC( (*K).mg, 1, struct mgdata, 
		  navsto_MG_init_assem_t21);
      mg_null_def( (*K).mg);
    }

  err=multilvl_init_t2( m2, (*m2).dim, (*K).mld);
  FUNCTION_FAILURE_HANDLE( err, multilvl_init_t2,
			   navsto_MG_init_assem_t21);

  /* initialise the sorters */
  TRY_MALLOC( (*K).mlsorters, 2, FIDX*, navsto_MG_init_assem_t21);
  for (i=0; i<(*m2).dim; i++)
    {
      TRY_MALLOC( (*K).mlsorters[i], (*(*K).mld).nlevl[0], FIDX,
		  navsto_MG_init_assem_t21);

      err=navsto_dofsorter_t21( m2,  (*K).mld, i, (*K).mlsorters[i]);
      FUNCTION_FAILURE_HANDLE( err, navsto_dofsorter_t21,
			       navsto_MG_init_assem_t21);
    }

  /* release the temporary vertex positions */
  if (vx_new!=NULL) free(vx_tmp);

  return SUCCESS;
}


/*FUNCTION*/
int navsto_MG_C_assem_t21(struct mesh *m1, FIDX lvl,
			  struct navsto_matrix *K, struct vector *rhs,
			  double *maxPe,
			  struct vector *rhs_stokes,
			  struct vector *u_old,
			  struct mesh *m2, double *vx_new, FIDX vx_max,
			  int lintype
/* repeatetly calls navsto_C_assem_t21 to perform the assembly of the
   changing part Fs of the stiffness matrix K and all its coarse
   grid versions and the right hand side vector rhs which result from
   linearised finite element discretisation of the Navier Stokes
   equation on the T2 mesh m2, which results from refinement of the T1
   mesh m1, such that  

           K x = rhs

   defines the new iterate (approximate) solution x=(u_x, u_y, p) of
   the Navier Stokes equation

   for the description of the linearisations see navsto_C_assem_t21

   Input:  m1      - the coarse mesh (T1)
           lvl     - the current refinement level
           rhs_stokes
                   - the righthand side of the Stokes system, as
                     obtained by navsto_init_assem_t21
	   u_old   - the previous iterate solution
	   vx_new  - overriding set of node positions for T2 meshes
                     ignored if ==NULL
	   vx_max  - maximum number of nodes for use of vx_max
                     ignored if vc_new==NULL
	   lintype - the type of linearisation, see navsto_C_assem_t21
	             for description
           
   In/Out: K       - stiffness matrix in specialised storage, the
                     structure K has to be initialised by
                     navsto_init_assem_t21 once, and afterwards it is
                     just updated (the Fs part, which is the only part
                     that changes between iterations) in this routine

   Output: m2      - the fine mesh (T2)
	   rhs     - righthand side vector, has to be initialised by
                     navsto_init_assem_t21 once, and afterwards it is
                     just updated in this routine
	   maxPe   - max element Peclet number Pe for the finest mesh
	             (maxPe>1 means the FE discretisation on this mesh
	             is unstable)



   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		     ){
  FIDX i, j, d, lvx_nr, tl, dim;
  int  Fsonly, err;
  double nu_lvl;
  double *vx_tmp=NULL;

  dim = m1->dim;

  /* free old stuff */
  mg_free ((*K).mg);

  /* protect vx_new from being freed */
  if (vx_new== m2->vertex) m2->vertex=NULL;
  mesh_free( m2);

  /* restart with coarse t2 mesh */
  err=mesh_t1_to_t2(m1, m2);
  FUNCTION_FAILURE_HANDLE( err, mesh_t1_to_t2, navsto_MG_C_assem_t21);

  /* override node positions */
  if (vx_new != NULL)
    {
      vx_tmp     = m2->vertex;
      if (m2->vx_nr <= vx_max)
	m2->vertex = vx_new;
      else
	{ 
	  fprintf(stderr, "navsto_MG_C_assem_t21: "
		  "vx_max insufficient!\n");
	  return FAIL;
	}
    }

  nu_lvl=m1->para[MC2XPANUPO];
  
  for (tl=0; tl<=lvl; tl++)
    {
      lvx_nr = m2->vx_nr;

#if (((STABTYPE==0)||(STABTYPE==1))||((STABTYPE==2)||(STABTYPE==5)) \
      ||(STABTYPE==6))
      if(tl>0)   /* coarse meshes 2nd order */
#elif ((STABTYPE==3)||(STABTYPE==4))
      if(tl<lvl) /* coarse meshes 1st order (submesh of 2nd order) */
#else
#error "STABTYPE unknown"
#endif
	{
	  /* protect vx_new */
	  if (vx_new != NULL) m2->vertex = vx_tmp;

	  err=mesh_refine_uniform_t2(m2);
	  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2,
				   navsto_MG_init_assem_t21);
	  /* override node positions */
	  if (vx_new != NULL)
	    {
	      vx_tmp     = m2->vertex;
	      if (m2->vx_nr <= vx_max)
		m2->vertex = vx_new;
	      else
		{ 
		  fprintf(stderr, "navsto_MG_C_assem_t21: "
			  "vx_max insufficient!\n");
		  return FAIL;
		}
	    }
	}

#if (STABTYPE==1)
      /* coarse meshes have lower Re (== higer nu) */
      if (tl<lvl)
	m2->para[MC2XPANUPO]=nu_lvl*pow(2.0, (double) lvl-tl);
      else
	m2->para[MC2XPANUPO]=nu_lvl;
#endif

      /* if errors occur here due to the size of K.Fs[tl], it may be
	 due to the mixup created by STABTYPE==3, try =2 instead */
      if(tl!=lvl) Fsonly=1; else Fsonly=0;
      err=navsto_C_assem_t21( K, rhs, maxPe, rhs_stokes, u_old, m2,
			      tl, Fsonly, lintype); /* */
      FUNCTION_FAILURE_HANDLE( err, navsto_C_assem_t21,
			       navsto_MG_C_assem_t21);

#ifdef MG_F_NO_CMAT
#warning "no exact coarse-grid-solver for F-part of F_p"
#else
      if(tl==0)
	{
	  FIDX ndiri, *diris;
	  TRY_MALLOC( diris, lvx_nr*(*m2).dim, FIDX,
		      navsto_MG_C_assem_t2);
	  coarse_mat_free( &(*K).cmat);
	  ndiri=0;
	  for (i=0; i<(*K).bn_nr; i++)
	    {
	      FIDX node;
	      node=(*K).nodes[i];
	      if (node<lvx_nr)
		for (d=0; d<dim; d++)
		  {
		    diris[ndiri]=node+d*lvx_nr;
		    ndiri++;
		  }
	    }


	  /* set the coarse grid matrix */
	  err=coarse_mat_set( &(*K).Fs[tl], ndiri, diris, 2,
			      &(*K).cmat); 
	  FUNCTION_FAILURE_HANDLE( err, coarse_mat_set,
				   navsto_MG_C_assem_t21);
	  free(diris);
	}
#endif
    }

  err=mg_init_tx( (*K).Fs, m2, (*K).mld, (*K).mg, NULL );
  FUNCTION_FAILURE_HANDLE( err, mg_init_tx, navsto_MG_C_assem_t21);

  err=navsto_velssorter_init_t21( m2, (*K).mld, (*K).mg );
  FUNCTION_FAILURE_HANDLE( err, navsto_velssorter_init_t21,
			   navsto_MG_C_assem_t21);

#ifdef MG_F_NO_DIAGPROJ
#warning "no projection of invdiag for MG for F-part of F_p"
#else
  for (i=0; i<(*K).bn_nr; i++)
    for (j=0; j<=(*K).mld->lmax; j++)
      for (d=0; d<(*K).mld->dim; d++)
	{
	  FIDX node;
	  node=d*(*m2).vx_nr+(*K).nodes[i];
	  MLVLFINDENTRY(node, node, j, *(*K).mld);
	  if (node>=0)
	    {
	      (*K).mg->invdiag[node]=0.0;
	    }
	}
#endif

  /* reset the inner counter and step */
  for (i=0; i<5; i++)
    {
      (*K).innercount[i]=0;
      (*K).innersteps[i]=-1;
    }

  /* reset the MGscale */
  for (i=0; i<=lvl; i++)
    (*K).MGscale[i]=0.0;

  /* release the temporary vertex positions */
  if (vx_new!=NULL) free(vx_tmp);
  
  return SUCCESS;
}

/*FUNCTION*/
int navsto_MG_ref_intp_ass_t21(struct mesh *m1, FIDX lvl,
			       struct mesh *m2,
			       double *vx_new, FIDX vx_max,
			       struct navsto_matrix *K, 
			       struct vector *rhs_stokes,
			       struct vector *rhs,
			       struct vector *uc,
			       struct vector *uf
/* performs uniform refinement of the mesh, reinitialises all data for
   the finer mesh, navsto_init_assem_t21 on the finer and all coarse
   meshes, interpolates the given coarse solution uc to a finer
   solution uf which fulsfils the boundary conditions of the finer
   mesh

   Input:  m1      - the coarse mesh
           lvl     - the new level for the fine mesh
	   vx_new  - overriding set of node positions for T2 meshes
	   vx_max  - maximum number of nodes for use of vx_max

   In/Out: m2      - the mesh, will be refined
           K       - stiffness matrix in specialised storage, the
                     structure K is reinitialised here, memory
                     reallocated and during successive iterations
                     navsto_MG_C_assem_t21 has to be used to update the
                     changing parts of the system
           rhs_stokes
                   - the righthand side of the Stokes system, it is
		     reinitialised here
           rhs     - the righthand side of the linearised Navier
                     Stokes system, it is reinitialised here, on
                     exit it will be identical to rhs_stokes
	   uc      - the solution on the current mesh m, is
               	     reinitialised to size for the new mesh
           uf      - the solution uc interpolated to a solution on the
                     new fine mesh, adjusted to fulfil the boundary
                     conditions, the given vector is reinitialised to
                     the new size



   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
			    ){
  int  err;
  FIDX i, j, k, d;
  FIDX fat1, fat2, chld;
  FIDX leveltop, dim, vx_nrc, vx_nrf, hi_w, eh_w, eg_w;

  FIDX *pdofc, pvx_nrc;
  FIDX *pdoff, pvx_nrf;

  FIDX bas_n;
  double *phi, *gradphi, *hessphi;
  double points[9*2]={ 0.25, 0.0,
		       0.25, 0.25,
		       0.0,  0.25,
		       0.75, 0.25,
		       0.5,  0.25,
		       0.75, 0.0,
		       0.0,  0.75,
		       0.25, 0.5,
		       0.25, 0.75 };

  struct vector utmp;

  double *vx_tmp;

  /* initialise common information */
  dim    = (*m2).dim;
  hi_w   = (*m2).hi_w;
  eh_w   = (*m2).eh_w;
  eg_w   = (*m2).eg_w;

  /* initialise interpolation info */
  err= eval_basis( dim, tria, 2,
		   9, points,
		   &bas_n, &phi, &gradphi, &hessphi);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_MG_ref_intp_ass_t21);
  free( gradphi );
  free( hessphi );

  /* save the information we still need for the interpolation  */
  vx_nrc = (*m2).vx_nr;
  pdofc  = (*K).pdof;
  pvx_nrc= (*K).pvx_nr;
  /* prevent the array from being freed, as long as we still need it */
  (*K).pdof=NULL;


  /* check sanity */
  if ((*uc).len!=dim*vx_nrc+pvx_nrc)
    {
      fprintf(stderr,
	      "navsto_MG_ref_intp_ass_t21: dimensions of uc don't "
	      "match the mesh!\n");
      (*K).pdof=pdofc;
      return FAIL;
    }


  /* free the data not needed anymore */
  vector_free(rhs_stokes);
  vector_free(rhs);
  vector_free(uf);

  
  /* protect vx_new */
  if (vx_new==m2->vertex)
    {
      FIDX vx_size= m2->vx_w*m2->vx_nr;
      TRY_MALLOC(vx_tmp, m2->vx_w*m2->vx_max, double,
		 navsto_MG_ref_intp_ass_t21);
      /* copy to vx_tmp */
      for (i=0; i<vx_size; i++)
	vx_tmp[i] = vx_new[i];
      
      m2->vertex=vx_tmp;
    }

  /* refine the mesh */
  err=mesh_refine_uniform_t2( m2 );
  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2,
			   navsto_MG_ref_intp_ass_t21);

  /* override node positions */
  if (vx_new != NULL)
    {
      vx_tmp     = m2->vertex;
      if (m2->vx_nr <= vx_max)
	m2->vertex = vx_new;
      else
	{ 
	  fprintf(stderr, "navsto_MG_ref_intp_ass_t21: "
		  "vx_max insufficient!\n");
	  return FAIL;
	}
      /* can allready free vx_tmp */
      free(vx_tmp);
      vx_tmp=NULL;
    }

  /* initialise the non-changing parts of the stiffness matrix,
     dependent on the type of the matrix (SDFEM or F_p tuned) */
  if ( (*K).Bs[0].row_nr != 0)
    {
      /* old F_p tuned type */
      err= navsto_MG_init_assem_t21(m1, lvl, K, rhs_stokes, rhs,
				    &utmp, m2, vx_new, vx_max);
      FUNCTION_FAILURE_HANDLE( err, navsto_MG_init_assem_t21,
			       navsto_MG_ref_intp_ass_t21);

      leveltop= (*(*K).mld).lmax;

      printf("leveltop=%d    lvl=%d\n", (int) leveltop, (int) lvl);
    }
  else
    {
      /* SDFEM type */
      err=navsto_MG_init_assem_SDFEM_t21( m1, lvl, K, rhs,
					  &utmp, m2, vx_new, vx_max);
      FUNCTION_FAILURE_HANDLE( err, navsto_MG_init_assem_SDFEM_t21, 
			       navsto_MG_ref_intp_ass_t21);

      err=vector_alloc( rhs_stokes, utmp.len );
      FUNCTION_FAILURE_HANDLE( err, vector_alloc,
			       navsto_MG_ref_intp_ass_t21);
      for (i=0; i<(*rhs_stokes).len; i++) { (*rhs_stokes).V[i]=0.0; }

      leveltop= lvl;
    }

  err=vector_alloc( uf, utmp.len );
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_MG_ref_intp_ass_t21);

  pdoff  = (*K).pdof;
  pvx_nrf= (*K).pvx_nr;
  vx_nrf = (*m2).vx_nr;


  /* copy all the old velocity components */
  for (i=0; i<vx_nrc; i++)
    for (j=0; j<dim; j++)
      { (*uf).V[i+j*vx_nrf]=(*uc).V[i+j*vx_nrc]; }

  /* copy all the old pressure components */
  for (i=0; i<vx_nrc; i++)
    { 
      if (pdofc[i]>=0)
	{ 
	  (*uf).V[pdoff[i]+dim*vx_nrf]=
	    (*uc).V[pdofc[i]+dim*vx_nrc]; 
	}
    }
  
  /* interpolate the new components, which are all generated
     as childs of a certain level of hierarchy entries */
  for (i=0; i<(*m2).eh_nr; i++)
    {
      if ((*m2).elhier[i*eh_w+MCT2EHLVL]==leveltop)
	{
	  /* interpolate velocity components */
	  for(k=0; k<9; k++) /* for each child of this elhier */
	    {
	      chld=(*m2).elhier[i*eh_w+MCT2EHCHL1+k];
	      /* reset to zero */
	      (*uf).V[chld       ]=0.0;
	      (*uf).V[chld+vx_nrf]=0.0;
	      for(j=0; j<bas_n; j++) /* for each father of this elhier */
	      {
		fat1=(*m2).elhier[i*eh_w+MCT2EHFAT1+j];
		for (d=0; d<dim; d++)
		  {
		    (*uf).V[chld+d*vx_nrf]+=
		      phi[k*bas_n+j]*(*uc).V[fat1+d*vx_nrc];
		  }	  
	      }
	    }
	}
    }
  for (i=0; i<(*m2).hi_nr; i++)
    {
      if ((*m2).hier[i*hi_w+MCT2HILVL]==leveltop-1)
	{
	  /* interpolate pressure components */
	  fat1=pdofc[(*m2).hier[i*hi_w+MCT2HIFAT1  ]];
	  fat2=pdofc[(*m2).hier[i*hi_w+MCT2HIFAT1+1]];
	  chld=pdoff[(*m2).hier[i*hi_w+MCT2HICHLD  ]];

#ifdef DEBUGFEINS
	  if (((fat1<0)||(fat2<0))||(chld<0))
	    {
	      fprintf(stderr,
		      "navsto_MG_ref_intp_ass_t21: pressure hierarchy "
		      "wrong?\n");
	      return FAIL;
	    }
#endif
	
	  (*uf).V[chld+dim*vx_nrf]=
	    0.5*((*uc).V[fat1+dim*vx_nrc]+(*uc).V[fat2+dim*vx_nrc]);
	}
    }

  /* now correct the velocities to match the boundary conditions,
     simply copy the velocities of all bc nodes from utmp */
  for (i=0; i<(*K).bn_nr; i++)
    {
      chld=(*K).nodes[i];
      for (j=0; j<dim; j++)
	{ (*uf).V[chld+j*vx_nrf]=utmp.V[chld+j*vx_nrf];	}
    }
  
  /* reinitialise uc */
  vector_free(uc);
  err=vector_alloc( uc, (*uf).len );
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_MG_ref_intp_ass_t21);


  /* free data belonging to the old mesh */
  if ( (*K).Bs[0].row_nr != 0)
    {
      free(pdofc);
    }

  /* free temporary data */
  free(phi);
  vector_free(&utmp);

  return SUCCESS;
}




/*FUNCTION*/
int navsto_maxnu_init_t21(struct mesh *m1,
			  double *nu_max
/* solves the Navier-Stokes equations on the coarse grid (with low
   accuracy) to get an estimate for the highest possible Reynolds
   number on the coarse mesh

   Input:  m1      - the coarse mesh (T1)

   Output: nu_max  - estimate for the highest Re for which the
                     discretisation on the coarse mesh is stable



   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
			  ){
  int  err, iter, linmeth;
  FIDX i;
  struct mesh msh2;
  struct navsto_matrix K;
  struct vector rhs_stokes, rhs, *x, *x_old;
  double maxPe, nu, nu_keep, resi;

  linmeth=1;
  nu=1e2;

  nu_keep=m1->para[MC2XPANUPO];

  x=NULL;
  x_old=NULL;

  /* null define msh2 */
  mesh_init( &msh2);


  /* do a few iterations of that to keep the influence of the low
     trial Re low */
  for (i=0; i<1; i++)
    {
      m1->para[MC2XPANUPO]=nu;

      /* init the matrix struct */
      err=navsto_matrix_init( &K, m1->dim, 1);
      FUNCTION_FAILURE_HANDLE( err, navsto_matrix_init,
			       navsto_maxnu_init_t21);

      TRY_MALLOC( x, 1, struct vector, navsto_maxnu_init_t21);
      err=navsto_MG_init_assem_t21( m1, 0, &K, &rhs_stokes, &rhs,
				    x, &msh2, NULL, 0 );
      FUNCTION_FAILURE_HANDLE( err, navsto_MG_init_assem_t21,
			       navsto_maxnu_init_t21);

      if (x_old!=NULL)
	{
	  vector_free(x);
	  free(x);
	  x=x_old;
	}

      K.innereps=1e-2;

      K.innercount[0]=0;
      K.innercount[1]=0;
      K.innercount[2]=0;
      K.innercount[3]=0;

      K.innersteps[0]=-1;
      K.innersteps[1]=-1;
      K.innersteps[2]=-1;
      K.innersteps[3]=-1;

      K.mg->vcycles=1;
      K.mg->smooths=1;
      K.mg->stop_eps=0.0;


      err=navsto_MG_C_assem_t21( m1, 0, &K, &rhs, &maxPe, 
				 &rhs_stokes, x, &msh2, NULL, 0,
				 linmeth);
      FUNCTION_FAILURE_HANDLE( err, navsto_MG_C_assem_t21,
			       navsto_maxnu_init_t21);  
      err=GMRES( 100, 100, 2, 0.0, 1e-4, 1, x, &resi, &iter,
		 navsto_matrix_tim_vec, navsto_projector_w_precon,
		 &K, &rhs, NULL ); /* */
      FUNCTION_FAILURE_HANDLE( err, GMRES, navsto_maxnu_init_t21);  
      err=navsto_MG_C_assem_t21( m1, 0, &K, &rhs, &maxPe, 
				 &rhs_stokes, x, &msh2, NULL, 0,
				 linmeth );
      FUNCTION_FAILURE_HANDLE( err, navsto_MG_C_assem_t21,
			       navsto_maxnu_init_t21);  
  
      nu=maxPe*2*nu;
      /* printf("init %d maxPe=%8.1e    ==> nu0_max=%8.1e\n", (int) i,
	 maxPe, nu); /* */

      x_old=x;

      mesh_free(&msh2);
      navsto_matrix_free(&K);
      vector_free(&rhs);
      vector_free(&rhs_stokes);
    }
  vector_free(x_old);
  free(x_old);

  m1->para[MC2XPANUPO]=nu_keep;

  *nu_max=nu;

  return SUCCESS;
}





/*FUNCTION*/
int navsto_velssorter_init_t21(struct mesh *m,  struct multilvl *ml, 
			       struct mgdata *mg
/* initialises the sorterl part of the mgdata struct

   Input:  m       - the mesh, needs to be the finest level of the
                     hierarchy 
           ml      - multilevel data struct
	   
   In/Out: mg      - multigrid data struct, has to be initialised
                     already sorterl part is set here

   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		     ){
  int err;
 
  /* allocate memory */
  TRY_MALLOC( (*mg).sorterl, (*ml).nlevl[0], FIDX,
	      navsto_velssorter_init_t21);
  
  /* call routine to do the sort by x (dir1=0) */
  err=navsto_dofsorter_t21(m,  ml,        0, (*mg).sorterl);
  FUNCTION_FAILURE_HANDLE( err, navsto_dofsorter_t21,
			   navsto_velssorter_init_t21); 

  return SUCCESS;
}
  

/*FUNCTION*/
int navsto_dofsorter_t21(struct mesh *m,  struct multilvl *ml, 
			 FIDX dir1,
			 FIDX *sortperm
/* initialises the defines a permutation on of the dofs on each level
   of the mesh that sorts the nodes according to their coordinates in
   the mesh, first by spacial direction dir1 (e.g. dir1==0, sort by x
   first), then by the remaining

   Input:  m       - the mesh, needs to be the finest level of the
                     hierarchy 
           ml      - multilevel data struct, used to define the
                     multilevel sorter array
           dir1    - first direction to be sorted by, i.e. dir1==0:
                     sort by increasing x coordinates, dir1==1: sort
                     by increasing y coordinate
	   
   Out:    sortperm- integer array, size ml.nlevl[0], has to be
                     allocated by the calling routine, defines
                     permutation that sorts the dofs in the desired
                     order 

   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		     ){
  FIDX i, j, lvl;

  FIDX dim, vx_nr, vx_w, dir2;

  double this_x;
  FIDX this_x_start;

  struct intdouble *sorter;

  /****************   init ******************************************/
  dim   = (*ml).dim;
  vx_nr = (*m).vx_nr;
  vx_w  = (*m).vx_w;

  if ((dir1!=0)&&(dir1!=1))
    {
      fprintf(stderr,"navsto_dofsorter_t21: dir1 has invalid value\n");
      return FAIL;
    }

  /* define the second sort direction */
  dir2 = (dir1+1)%2;

  /* create the sorter */
  /* initialise a intdouble vector to be used to sort the nodes */
  TRY_MALLOC( sorter, vx_nr, struct intdouble,
	      navsto_dofsorter_t21);
  for (i=0; i<vx_nr; i++)
    {
      /* first we use the dir1 component as data */
      sorter[i].i = i; /* store the node number */
      sorter[i].d = (*m).vertex[i*vx_w+MCT2VXSTRT+dir1];
    }
  /* sort by the dir1 coordinate */
  qsort( sorter, vx_nr, sizeof(struct intdouble), comp_intdouble_d);

  /* now for each set of equal dir1-s sor them by dir2 */
  this_x       = sorter[0].d;
  this_x_start = 0;
  for (i=0; i<vx_nr; i++)
    {
      FIDX this_node;
      if (this_x != sorter[i].d)
	{
	  /* new x_dir1 is starting, so sort the old one first by x_dir2 */
	  qsort( &sorter[this_x_start], i-this_x_start,
		 sizeof(struct intdouble), comp_intdouble_d);

	  /* mark the start of a new x_dir1 */
	  this_x       = sorter[i].d;
	  this_x_start = i;
	}

      /* now make the y component the data */
      this_node = sorter[i].i;
      sorter[i].d = (*m).vertex[this_node*vx_w+MCT2VXSTRT+dir2];
    }
  /* sort the last set of equal x_dir1-s by x_dir2 */
  qsort( &sorter[this_x_start], vx_nr-this_x_start,
	 sizeof(struct intdouble), comp_intdouble_d);

  /* done sorting finest level */
  /* now the sorter sorts the nodes by (x,y), build the sortperm from
     that */ 

  /* now init the coarser levels */
  for (lvl=(*ml).lmax; lvl>=0; lvl-- )
    {
      FIDX lvl_sorted = 0;
      FIDX lvl_base   = (*ml).nlevl[lvl+1];

      for (i=0; i<vx_nr; i++)
	{
	  FIDX this_node;

	  this_node = sorter[i].i;
	  
	  for (j=0; j<dim; j++)
	    {
	      FIDX dofL, dof_lvl;

	      dofL = this_node + j*vx_nr;
	      
	      MLVLFINDENTRY( dof_lvl, dofL, lvl, (*ml) );
	      if (dof_lvl>=0)
		{
		  sortperm[lvl_sorted + lvl_base] =
		    dof_lvl - lvl_base;
		  lvl_sorted++;
		}
	    } /* end loop dim */

	} /* end loop vx */
      
      if (lvl_sorted != (*ml).nlevl[lvl]-lvl_base)
	{
	  fprintf(stderr,"navsto_dofsorter_t21: "
		  "lvl_sorted counter does not add up to expected value\n");
	  return FAIL;
	}

    } /* end loop lvl */

  /* free local data */
  free(sorter);

  return SUCCESS;
}
  















/**********************************************************************
***********************************************************************
***                                                                 ***
***                                                                 ***
***       new versions which do SDFEM                               ***
***                                                                 ***
***                                                                 ***
***********************************************************************
**********************************************************************/






/*FUNCTION*/
int navsto_assem_SDFEM_t21(struct navsto_matrix *K, struct vector *rhs,
			   struct vector *u_old, struct mesh *m,
			   FIDX lvl 
/* performs the assembly of the stiffness matrix K and the right hand
   side vector rhs which result from linearised SDFEM (streamline
   diffusion finite element method) discretisation of the Navier
   Stokes equation on the T2 mesh m, such that

           K x = rhs

   defines the new iterate (approximate) solution x=(u_x, u_y, p) of
   the Navier Stokes equation

         -nu Laplace(u) + u * grad u + grad p = f
                                        div u = 0

   SDFEM is described in L. Tobiska and R. Verfuerth, Analysis of a
   Streamline Diffusion Finite Element Method for The Stokes and
   Navier-Stokes equations, SIAM J. Numer. Anal., Vol. 33, No. 1,
   pp. 107--127, 1996

   with boundary conditions as given in the mesh m    
   (p2 triangles)

   Input:  u_old   - the previous iterate solution
	   m       - the mesh
	   lvl     - the level who's Fs is written
	   
   Out:    K       - stiffness matrix in specialised storage, the
                     structure K has to be initialised by
                     navsto_MG_init_assem_SDFEM_t21 once, and
                     afterwards it is updated in this routine

   Output: rhs     - righthand side vector, has to be initialised by
                     navsto_MG_init_assem_t21 once, and afterwards it
                     is just updated in this routine


   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		     ){
  int  err;
  FIDX el, d, i, j, k, l, r, s;

  FIDX dim, bas_n1, bas_n2, LDKloc, LDKloc2, vx_nr, pvx_nr, eg_nr, hi_nr,  
    el_w, vx_w, eg_w, fc_w, bd_w, hi_w, fu_w,
    bigN;
  FIDX subtypes[2];

  /* for FORTRAN calls: */
  char fTrans='T', fNoTrans='N';
  double dhelp, done=1.0, dzero=0.0;
  int fdim, fbas_n1, fbas_n2, fLDKloc;

  double alpha;
  double delta;

  int linmeth=2;  /* linmeth=0 Newton
                     linmeth=1 inexact Newton (z change ignored)
                     linmeth=2 Picard iteration (z and w change ignored) */
  int usePstab=1; /* usePstab=1  allow grad(q) as testfunction in the
		                 stabilisation terms
		     usePstab=0  don't ... */

  struct sparse *Klvl;     /* pointer to this levels Advection Diffusion
			      stiffness matrix */
  struct int_data iform; /* integration formula 2d   */
  double *Uk;            /* velocity vector at the integration point */
  double *gUk;           /* jacobian of Uk, gUk[i*dim+j] ... i-th
			    velocity component, j-th spatial dimension */ 
  double dUk;            /* div(Uk) */ 
  double *LapUk;         /* Laplace(Uk) */ 

  double *gPk;           /* grad of pressure (const on element) */

  double *Kloc;          /* element stiffness matrix */
  double *rhsloc;        /* element rhs */

  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  double *gphiloc1 ;     /* gradient of linear basis functions */
  double *gphiloc2;      /* gradient of quadratic basis functions */
  double *Lphiloc2;      /* Laplacian of quadratic basis functions */
  double detJac;         /* determinant of the Jacobian */
  FIDX   *dofs;          /* degrees of freedom to which Kloc corresponds */

  double *phi1, *gradp1;
  double *phi2, *gradp2, *hphi2;
  double nu, weight;

  double h, abs_uc, heg, h_max, huc, hucmax;
                         /* values used to determine h for the
			    stabilization term */  
  double xc[2];          /* the center point of the element */
  double *phic, *gphic, *hphic;
                         /* the basis function values at this point */

  /****************   init ******************************************/
  dim   = (*m).dim;
  vx_nr = (*m).vx_nr;
  eg_nr = (*m).eg_nr;
  hi_nr = (*m).hi_nr;
  vx_w  = (*m).vx_w;
  el_w  = (*m).el_w;
  eg_w  = (*m).eg_w;
  fc_w  = (*m).fc_w;
  bd_w  = (*m).bd_w;
  hi_w  = (*m).hi_w;
  fu_w  = (*m).fu_w;

  nu= (*m).para[MC2XPANUPO];

  /* stabelisation paramers alpha and delta, from Gelhard,Lube,... */
  alpha=1.0/(nu*nu);
  delta=nu;

  /* get integration formula */
  subtypes[0]=1;
  subtypes[1]=2;

  /* degree=7 
     max degree of integrand = 6 (w*grad(u),z*grad(v))
     degree of detJac=0,
     so 7 is sufficient */
  err=cubature_bases( dim, 7, tria, 2, subtypes, &iform); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, navsto_assem_SDFEM_t21);

  /* make phi2 and gradphi2 better accessible */
  phi1    = (iform.bases[0]->phi);
  gradp1  = (iform.bases[0]->gradphi);
  bas_n1  = (iform.bases[0]->num_basis);
  phi2    = (iform.bases[1]->phi);
  gradp2  = (iform.bases[1]->gradphi);
  hphi2   = (iform.bases[1]->hessphi);
  bas_n2  = (iform.bases[1]->num_basis);

  fbas_n1 = (int) bas_n1;
  fbas_n2 = (int) bas_n2;
  fdim    = (int) dim;

  xc[0]=1.0/3.0;  /* the center of gravity of the master element is
		     (1/3,1/3) */
  xc[1]=xc[0];
  /* evaluate the basis functions for this point */
  err=eval_basis( dim, tria, subtypes[1], 1, xc, &i, &phic, &gphic, &hphic);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, navsto_assem_SDFEM_t21);
  /* free unneeded info */
  free(hphic);


  if (dim != 2)
    {
      /* cry */
      fprintf(stderr,
	      "navsto_assem_SDFEM_t21: dim !=2 not implemented (Jacinv)\n");
      return FAIL;
    }

  if ( (lvl < 0)||(lvl > (*K).lvlmax) )
    {
      /* cry */
      fprintf(stderr,
	      "navsto_assem_SDFEM_t21: invalid lvl\n");
      return FAIL;
    }

  pvx_nr=(*K).pvx_nr_lvl[lvl];

  bigN = dim*vx_nr + pvx_nr;
  if ( (*K).Fs[lvl].row_nr!=bigN )
    {
      /* cry */
      fprintf(stderr,
	      "navsto_assem_SDFEM_t21: size of Fs wrong???\n");
      return FAIL;
    }

  if ( (*K).Bs[0].row_nr!=0 )
    {
      /* cry */
      fprintf(stderr,
	      "navsto_assem_SDFEM_t21: wrong type of navsto-matrix?\n"
	      "                        (Bs[0].row_nr!=0)\n");
      return FAIL; 
    }
    

  LDKloc  = dim*bas_n2+bas_n1;
  LDKloc2 = LDKloc*LDKloc;
  fLDKloc = (int) LDKloc; /* should be no problem, only small */

  /* allocate memory for local data */
  TRY_MALLOC( Uk, dim, double, navsto_assem_SDFEM_t21);
  TRY_MALLOC( gUk, dim*dim, double, navsto_assem_SDFEM_t21);
  TRY_MALLOC( LapUk, dim, double, navsto_assem_SDFEM_t21);

  TRY_MALLOC( gPk, dim, double, navsto_assem_SDFEM_t21);

  TRY_MALLOC( Kloc, LDKloc2, double, navsto_assem_SDFEM_t21);
  TRY_MALLOC( rhsloc, LDKloc, double, navsto_assem_SDFEM_t21);

  TRY_MALLOC( Jac, dim*dim, double, navsto_assem_SDFEM_t21);
  TRY_MALLOC( Jacinv, dim*dim, double, navsto_assem_SDFEM_t21);

  TRY_MALLOC( gphiloc1, dim*bas_n1, double, navsto_assem_SDFEM_t21);
  TRY_MALLOC( gphiloc2, dim*bas_n2, double, navsto_assem_SDFEM_t21);
  TRY_MALLOC( Lphiloc2, bas_n2, double, navsto_assem_SDFEM_t21);

  TRY_MALLOC( dofs, LDKloc, FIDX, navsto_assem_SDFEM_t21);

  /* set Klvl as pointer to the right level's Fs */
  Klvl = &(*K).Fs[lvl];

  /* clear Fs */
  sparse_empty(Klvl);

  /* initialise rhs */
  for (i=0; i<bigN; i++)
    {
      (*rhs).V[i]=0.0;
    }

  /* loop over all elements */
  for (el=0; el<(*m).el_nr; el++)
    {
      /* determine the h for this element */
      /* first set Uk=uc, the velocity vector at the center of the
	 element */
      for (j=0; j<dim; j++)
	{
	  Uk[j]=0.0;
	  for (i=0; i<bas_n2; i++)
	    Uk[j]+=phic[i]
	      *(*u_old).V[(*m).elem[el*el_w+MCT2ELNOD1+i]+j*vx_nr];
	}
      abs_uc=0;
      for (j=0; j<dim; j++)
	abs_uc+=Uk[j]*Uk[j];
      abs_uc=sqrt(abs_uc);

      /* now get the length of the edge which coincides best with the
	 direction of uc */
      hucmax=-1.0;
      h_max=-1.0;
      for (i=0; i<bas_n1; i++)
	for (j=i+1; j<bas_n1; j++)
	  {
	    FIDX nodei, nodej;
	    nodei=(*m).elem[el*el_w+MCT2ELNOD1+i];
	    nodej=(*m).elem[el*el_w+MCT2ELNOD1+j];
	    huc=0.0;
	    for(d=0; d<dim; d++)
	      huc+=Uk[d]*( (*m).vertex[nodei*vx_w+MCT2VXSTRT+d]
			   -(*m).vertex[nodej*vx_w+MCT2VXSTRT+d] );

	    heg=0.0; /* h_edge */
	    for(d=0; d<dim; d++)
	      heg+=( (*m).vertex[nodei*vx_w+MCT2VXSTRT+d]
		     -(*m).vertex[nodej*vx_w+MCT2VXSTRT+d] )
		*( (*m).vertex[nodei*vx_w+MCT2VXSTRT+d]
		   -(*m).vertex[nodej*vx_w+MCT2VXSTRT+d] );
	    heg=sqrt(heg);

	    if (heg>h_max) h_max=heg; /* max(h_edge) */

	    huc=fabs(huc);
	    if (huc>hucmax)         /* max(h_edge) in direction of u */
	      {
		hucmax=huc;
		h=heg;
	      }
	  }

      h = h_max;
      /* printf("h = %e   nu= %e \n\n", h, nu); */

      /* collect the dofs */
      for (d=0; d<dim; d++)
	for (i=0; i<bas_n2; i++)
	  {
	    dofs[d*bas_n2+i]=(*m).elem[el*el_w+MCT2ELNOD1+i]+d*vx_nr;
	  }
      for (i=0; i<bas_n1; i++)
	{
	  dofs[dim*bas_n2+i]=(*K).pdof[(*m).elem[el*el_w+MCT2ELNOD1+i]]
	                     +d*vx_nr;
	}

      
      /* set Kloc to zero */
      for (i=0; i<LDKloc2; i++) 
	{
	  Kloc[i]=0.0;
	}

      /* set rhsloc to zero */
      for (i=0; i<LDKloc; i++) 
	{
	  rhsloc[i]=0.0;
	}


      /* compute the Jacobian on this element (is canstant as we use
	 linear shape functions only (for now) ) */
      /* Jac=0 */
      for (i=0; i<dim*dim; i++)
	Jac[i]=0.0;
      
      /* Jac = sum_{i=nodes} vertex(i)*gradphi1_i^T */
      for (i=0;i<bas_n1; i++)
	{
	  for (j=0; j<dim; j++)
	    {
	      for (r=0; r<dim; r++)
		{
		  Jac[j*dim+r]+= 
		    (*m).vertex[(*m).elem[el*el_w+MCT2ELNOD1+i]*vx_w
				+MCT2VXSTRT+j]
		    * gradp1[0*bas_n1*dim +i*dim +r];
		}
	    }
	}

      /* printf("Jac= %e  %e \n     %e  %e \n", Jac[0], Jac[2],
	 Jac[1], Jac[3]); /* */

      /* get detJac */
      detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
      /* printf("el=%3d, k=%2d, detJac= %f\n", el, k, detJac); /* */
	  
      /* get Jacinv (here direct) */
      /* Jacinv[j*dim+i] = d xhat_j /d x_i */
      Jacinv[0]=1.0/detJac*Jac[3];
      Jacinv[1]=-1.0/detJac*Jac[1];
      Jacinv[2]=-1.0/detJac*Jac[2];
      Jacinv[3]=1.0/detJac*Jac[0];

      /* printf("Jacinv= %e  %e \n        %e  %e \n", Jacinv[0],
	 Jacinv[2], Jacinv[1], Jacinv[3]); /* */

      /* gphiloc1= Jacinv * gradphi1[k,:,:] 
	 (=real world gradient T1, also constant)
      */
      dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n1, &fdim,
	      &done, Jacinv, &fdim, &(gradp1[0*bas_n1*dim]), &fdim,
	      &dzero, gphiloc1, &fdim );

      /* compute gPk */
      for (d=0; d<dim; d++)
	{
	  gPk[d]=0.0;
	  for (j=0; j<bas_n1; j++)
	    {
	      gPk[d] += gphiloc1[j*dim+d]
		*(*u_old).V[dofs[dim*bas_n2+j]];
	    }
	}


      /* loop over all integration points */
      for (k=0; k<iform.num_points; k++)
	{
	  /* gphiloc2= Jacinv * gradphi2[k,:,:]
	     (=real world gradient T2)
	  */
	  dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n2, &fdim,
		  &done, Jacinv, &fdim, &(gradp2[k*bas_n2*dim]), &fdim,
		  &dzero, gphiloc2, &fdim );

	  /* Lphiloc2[i] = sum_d=dim (d phi_i/d x_d)
                = sum_d ( sum_j 
		   ( sum_l hessphi2[k,i,j,l] * Jacinv[l,d] )
		    * Jacinv[j,d]  
		    + some term only important if shapefunctions not linear
		        ) */
	  for (i=0; i<bas_n2; i++)
	    {
	      Lphiloc2[i]=0.0;
	      for (d=0; d<dim; d++)
		{
		  double d2_dxd2;
		  d2_dxd2 = 0.0;
		  for (j=0; j<dim; j++)
		    {
		      double djpart;
		      djpart=0.0;
		      for (l=0; l<dim; l++)
			{
			  djpart += hphi2[k*bas_n2*dim*dim +i*dim*dim 
					  +j*dim +l] * Jacinv[l*dim+d];
			}
		      d2_dxd2 += djpart*Jacinv[j*dim+d];
		    }
		  Lphiloc2[i] += d2_dxd2;
		}
	    }
	  
	  /*********** test *************/
	  /* printf("\ntest: \nxhat=\n %e  %e\n", iform.points[0],
	     iform.points[1]);
	     printf("\nLphi=\n");
	     for (i=0; i<bas_n2; i++) printf("%e   ", Lphiloc2[i]);
	     printf("\ntest end\n"); exit(1); /* */
	  /********** end test **********/

	  /* compute Uk */
	  for (d=0; d<dim; d++)
	    {
	      Uk[d]=0.0;
	      for (j=0; j<bas_n2; j++)
		{
		  Uk[d] += phi2[k*bas_n2+j] *(*u_old).V[dofs[d*bas_n2+j]];
		  }
	      }

	  /* compute gUk */
	  for (d=0; d<dim; d++)
	    for (i=0; i<dim; i++)
	      {
		gUk[d*dim+i]=0.0;
		for (j=0; j<bas_n2; j++)
		  {
		    gUk[d*dim+i] += gphiloc2[j*dim+i]
		      *(*u_old).V[dofs[d*bas_n2+j]];
		  }
	      }

	  /* compute dUk */
	  dUk=0.0;
	  for (d=0; d<dim; d++)
	    {
	      dUk += gUk[d*dim+d];
	    }

	  /* compute LapUk */
	  for (d=0; d<dim; d++)
	    {
	      LapUk[d]=0.0;
	      for (j=0; j<bas_n2; j++)
		{
		  LapUk[d] += Lphiloc2[j]*(*u_old).V[dofs[d*bas_n2+j]];
		}
	    }


	  /* the real wolrd weight of this integration point */
	  weight=fabs(detJac) * iform.weights[k];

	  /* the viscous term a(.,.) */
	  dhelp=weight*nu;
	  /* elA += |detJac|*weigth[k] * nu* gphiloc2^T*gphiloc2 */
	  for (d=0; d<dim; d++)
	    {
	      dgemm_( &fTrans, &fNoTrans, &fbas_n2, &fbas_n2, &fdim,
		      &dhelp, gphiloc2, &fdim, gphiloc2, &fdim,
		      &done, &Kloc[d*bas_n2*LDKloc+d*bas_n2], &fLDKloc );
	    }

	  /* the pressure terms b(.,.) */
	  for (d=0; d<dim; d++)        /* the dimension (div term) */
	    for (j=0; j<bas_n1; j++)   /* the p-dofs */
	      for (i=0; i<bas_n2; i++) /* the u-dofs */
		{
		  /* b(p,v) */
		  /* Kloc(i,j)+=
		     -|detJac|*weigth[k] * phi1_j * gradphi2(i,d) */
		  Kloc[(j+dim*bas_n2)*LDKloc+i+d*bas_n2] += 
		    - weight * phi1[k*bas_n1+j] * gphiloc2[i*dim+d];
		  
		  /* b(q,u) */
		  /* Kloc(j,i)+=
		     |detJac|*weigth[k] * phi1_j * gradphi2(i,d) */
		  Kloc[(i+d*bas_n2)*LDKloc+j+dim*bas_n2] += 
		    weight * phi1[k*bas_n1+j] * gphiloc2[i*dim+d];
		}

	  /* the convective terms c(w,u,v) */
	  for (d=0; d<dim; d++)  /* dimension 1 (u_d, v_d) */
	    for (r=0; r<dim; r++) /* dimension 2 (w_r, d/dx_r) */
	      for (i=0; i<bas_n2; i++)
		{
		  for (j=0; j<bas_n2; j++)
		    {
		      /* c(u_old,phi2,phi2) */
		      Kloc[(j+d*bas_n2)*LDKloc+i+d*bas_n2] += 
			weight * Uk[r]
			* gphiloc2[j*dim+r]*phi2[k*bas_n2+i];
		      
		      if (linmeth<2)
			{
			  /* c(phi2,u_old,phi2) */
			  Kloc[(j+r*bas_n2)*LDKloc+i+d*bas_n2] += 
			    weight * phi2[k*bas_n2+j]
			    * gUk[d*dim+r]*phi2[k*bas_n2+i];
			}
		    }
		  if (linmeth<2)
		    {
		      /* c(u_old,u_old,phi2) (rhs) */
		      rhsloc[i+d*bas_n2] += 
			weight * Uk[r]
			* gUk[d*dim+r]*phi2[k*bas_n2+i];
		    }
		}

	  /* the stabelisation terms */
	  /* divergence term d2(u,v) */
	  dhelp=weight * nu*alpha*delta;
	  for (d=0; d<dim; d++)  /* dimension 1 (v_d) */
	    for (r=0; r<dim; r++) /* dimension 2 (u_r) */
	      for (i=0; i<bas_n2; i++)
		for (j=0; j<bas_n2; j++)
		  {
		    Kloc[(j+r*bas_n2)*LDKloc+i+d*bas_n2] += 
		      dhelp * gphiloc2[j*dim+r] *gphiloc2[i*dim+d];
		  }

	  /* the main term d1(w,z,u,p,v,q) */
	  dhelp=weight *1.0/nu*delta*h*h;
	  for (d=0; d<dim; d++)  /* dimension 1 (u_d,v_d) */
	    { 
	      double Uk_dot_gUk_d; /* =  Uk*gUk (d-th row) */
	      double prod_part1;   /* = -nu LapUk + Uk*gUk + gPk */

	      Uk_dot_gUk_d = 0.0;
	      for (s=0; s<dim; s++) Uk_dot_gUk_d += Uk[s]*gUk[d*dim+s];

	      prod_part1=-nu*LapUk[d] + Uk_dot_gUk_d + gPk[d];

	      for (i=0; i<bas_n2; i++)
		{
		  double uk_dot_gphiloc2i;
		  uk_dot_gphiloc2i = 0.0;
		  for (s=0; s<dim; s++)
		    uk_dot_gphiloc2i += Uk[s]*gphiloc2[i*dim+s];

		  /* velocity-velocity part of the matrix+rhs */
		  for (j=0; j<bas_n2; j++)
		    {
		      double uk_dot_gphiloc2j;
		      uk_dot_gphiloc2j = 0.0;
		      for (s=0; s<dim; s++)
			uk_dot_gphiloc2j += Uk[s]*gphiloc2[j*dim+s];

		      if (linmeth<2)
			{
			  /**********************************************/
			  /* d(phi2,u_old,u_old,p_old,phi2,0)  w-change */
			  /* Kloc part */
			  for (s=0; s<dim; s++) /* dimension 2 (w_d) */
			    {
			      Kloc[(j+s*bas_n2)*LDKloc+i+d*bas_n2] += 
				dhelp * phi2[k*bas_n2+j] *gUk[d*dim+s] 
				* uk_dot_gphiloc2i;
			    }
			}


		      if (linmeth<1)
			{
			  /*********************************************/
			  /* d(u_old,phi2,u_old,p_old,phi2,0) z-change */
			  /* only Kloc part */
			  for (s=0; s<dim; s++) /* dimension 2 (z_d) */
			    {
			      Kloc[(j+s*bas_n2)*LDKloc+i+d*bas_n2] += 
				dhelp * prod_part1 
				* phi2[k*bas_n2+j] *gphiloc2[i*dim+s];
			    }
			}


		      /*********************************************/
		      /* d(u_old,u_old,phi2,p_old,phi2,0) u-change */
		      /* Kloc part */
		      Kloc[(j+d*bas_n2)*LDKloc+i+d*bas_n2] += 
			dhelp 
			* ( -nu*Lphiloc2[j] +uk_dot_gphiloc2j )
			* uk_dot_gphiloc2i;
		    } /* end loop j velocity dofs */
		  
		  if (linmeth<2)
		    {
		      /*********************************************/
		      /* d(phi2,u_old,u_old,p_old,phi2,0) w-change */
		      /* rhs part */
		      rhsloc[i+d*bas_n2] += 
			-dhelp*( -nu*LapUk[d] + gPk[d])*uk_dot_gphiloc2i;
		    }

		  /*********************************************/
		  /* d(u_old,u_old,phi2,p_old,phi2,0) u-change */
		  /* rhs part */
		  rhsloc[i+d*bas_n2] += 
		    -dhelp*  gPk[d] * uk_dot_gphiloc2i;


		  /* velocity-pressure, pressure-velocity parts of the
		     matrix+rhs */ 
		  for (j=0; j<bas_n1; j++)
		    {
		      if ((linmeth<2)&&(usePstab))
			{
			  /*********************************************/
			  /* d(phi2,u_old,u_old,p_old,0,phi1) w-change */
			  for (s=0; s<dim; s++) /* dimension 2 (z_d) */
			    {
			      Kloc[(i+s*bas_n2)*LDKloc+j+dim*bas_n2] += 
				dhelp *(phi2[k*bas_n2+i] *gUk[d*dim+s] )
				* gphiloc1[j*dim+d];
			    }
			}

		      /*********************************************/
		      /* d(u_old,phi2,u_old,p_old,0,phi1) z-change */
		      /* lhs contribution is zero, but have pure rhs
			 contribution */

		      if (usePstab)
			{
			  /*********************************************/
			  /* d(u_old,u_old,phi2,p_old,0,phi1) u-change */
			  /* Kloc part */
			  Kloc[(i+d*bas_n2)*LDKloc+j+dim*bas_n2] += 
			    dhelp 
			    * ( -nu*Lphiloc2[i] +uk_dot_gphiloc2i )
			    * gphiloc1[j*dim+d];
			}

		      /*********************************************/
		      /* d(u_old,u_old,u_old,phi1,phi2,0) p-change */
		      Kloc[(j+dim*bas_n2)*LDKloc+i+d*bas_n2] += 
			dhelp * gphiloc1[j*dim+d] * uk_dot_gphiloc2i;
		    }

		  /*********************************************/
		  /* d(u_old,u_old,u_old,phi1,phi2,0) p-change */
		  rhsloc[i+d*bas_n2] += 
		    -dhelp *( -nu*LapUk[d] + Uk_dot_gUk_d)
		    * uk_dot_gphiloc2i;

		  /*********************************************/
		  /* 3*d(u_old,u_old,u_old,p_old,phi2,0)  rhs */
		  rhsloc[i+d*bas_n2] += 
		    (3.0-linmeth)*dhelp*( -nu*LapUk[d] + Uk_dot_gUk_d 
					  + gPk[d]) * uk_dot_gphiloc2i;

		} /* end loop velocity dofs i */
	      /* pressure-pressure part of the matrix+rhs */ 
	      for (i=0; i<bas_n1; i++)
		{
		  for (j=0; j<bas_n1; j++)
		    {
		      if (usePstab)
			{
			  /*********************************************/
			  /* d(u_old,u_old,u_old,phi1,0,phi1) p-change */
			  Kloc[(j+dim*bas_n2)*LDKloc+i+dim*bas_n2] += 
			    dhelp * gphiloc1[j*dim+d] * gphiloc1[i*dim+d];
			}
		    }


		  if ((linmeth<2)&&(usePstab))
		    {
		      /*********************************************/
		      /* d(phi2,u_old,u_old,p_old,0,phi1) w-change */
		      /* rhs part */
		      rhsloc[i+dim*bas_n2] += 
			-dhelp*( -nu*LapUk[d] + gPk[d])* gphiloc1[i*dim+d];
		    }

		  if ((linmeth<1)&&(usePstab))
		    {
		      /*********************************************/
		      /* d(u_old,phi2,u_old,p_old,0,phi1) z-change */
		      /* lhs contribution is zero, but have pure rhs
			 contribution */
		      rhsloc[i+dim*bas_n2] += 
			-dhelp*( -nu*LapUk[d] + Uk_dot_gUk_d + gPk[d])
			* gphiloc1[i*dim+d];
		    }

		  if (usePstab)
		    {
		      /*********************************************/
		      /* d(u_old,u_old,phi2,p_old,0,phi1) u-change */
		      /* rhs part */
		      rhsloc[i+dim*bas_n2] += 
			-dhelp*( gPk[d])* gphiloc1[i*dim+d];
		    }
		  
		  if (usePstab)
		    {
		      /*********************************************/
		      /* d(u_old,u_old,u_old,phi1,0,phi1) p-change */
		      /* rhs part */
		      rhsloc[i+dim*bas_n2] += 
			-dhelp*( -nu*LapUk[d] + Uk_dot_gUk_d )
			* gphiloc1[i*dim+d];
		    }

		  if (usePstab)
		    {
		      /*********************************************/
		      /* 3*d(u_old,u_old,u_old,p_old,0,phi1)  rhs */
		      rhsloc[i+dim*bas_n2] += 
			(3.0-linmeth)*dhelp*( -nu*LapUk[d] + Uk_dot_gUk_d 
					      + gPk[d]) * gphiloc1[i*dim+d];
		    }

		} /* end loop pressure dofs i */

	    } /* end loop over d (1st dimension, u_d,v_d) */

	} /* end loop over integration points */

      /***************************************************************
       * Kloc and rhsloc are ready, add them to the global matrix    *
       ***************************************************************/

      /* add Kloc to Klvl */
      err=sparse_add_local(Klvl, NoTrans, LDKloc, dofs, LDKloc, dofs,
			   Kloc, LDKloc );
      FUNCTION_FAILURE_HANDLE( err, sparse_add_local,
			       navsto_assem_SDFEM_t21);

      /* add rhsloc to rhs */
      for (i=0; i<LDKloc; i++)
	{
	  (*rhs).V[dofs[i]]+= rhsloc[i];
	}
    } /* end loop over all elements */


  /* correct the rhs (project it for the boundary conditions) */
  /* project the righthand side */
  for (i=0; i<(*K).bn_nr; i++)
    {
      for (j=0; j<dim; j++)
	{
	  (*rhs).V[(*K).nodes[i]+j*vx_nr]=0.0;
	}
    }

  /* free local data */
  free(dofs);

  free(Lphiloc2);
  free(gphiloc2);
  free(gphiloc1);

  free(Jacinv);
  free(Jac);

  free(rhsloc);
  free(Kloc);

  free(gPk);

  free(LapUk);
  free(gUk);
  free(Uk);

  free(phic);
  free(gphic);
  free_intdata (&iform);

  return SUCCESS;
}
  

/*FUNCTION*/
int navsto_MG_assem_SDFEM_t21(struct mesh *m1, FIDX lvl,
			      struct navsto_matrix *K, 
			      struct vector *rhs,
			      struct vector *u0,
			      struct mesh *m2, double *vx_new, FIDX vx_max
/* repeatetly calls navsto_assem_SDFEM_t21 to perform the assembly of the
   changing parts of the stiffness matrix K and all its coarse
   grid versions and the right hand side vector rhs which result from
   linearised finite element discretisation of the Navier Stokes
   equation on the T2 mesh m2, which results from refinement of the T1
   mesh m1, such that  

           K x = rhs

   defines the new iterate (approximate) solution x=(u_x, u_y, p) of
   the Navier Stokes equation

   Input:  m1      - the coarse mesh (T1)
           lvl     - the current refinement level
	   u0      - the previous iterate solution
	   vx_new  - overriding set of node positions for T2 meshes
	   vx_max  - maximum number of nodes for use of vx_max
           
   In/Out: K       - stiffness matrix in specialised storage, the
                     structure K has to be initialised by
                     navsto_init_MG_assem_SDFEM_t21 once, and
                     afterwards it is just updated (the Fs part, which
                     is the only part that changes between iterations)
                     in this routine 

   Output: m2      - the fine mesh (T2)
	   rhs     - righthand side vector, has to be initialised by
                     navsto_init_assem_t21 once, and afterwards it is
                     just updated in this routine


   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
				){
  int  err;
  FIDX i, d, dim, lvx_nr, tl;

  double *vx_tmp=NULL;

  dim = m1->dim;

  /* protect vx_new from being freed */
  if (vx_new== m2->vertex) m2->vertex=NULL;
  mesh_free( m2);

  /* restart with coarse t2 mesh */
  err=mesh_t1_to_t2(m1, m2);
  FUNCTION_FAILURE_HANDLE( err, mesh_t1_to_t2,
			   navsto_MG_assem_SDFEM_t21);

  /* override node positions */
  if (vx_new != NULL)
    {
      vx_tmp     = m2->vertex;
      if (m2->vx_nr <= vx_max)
	m2->vertex = vx_new;
      else
	{ 
	  fprintf(stderr, "navsto_MG_assem_SDFEM_t21: "
		  "vx_max insufficient!\n");
	  return FAIL;
	}
    }
  
  for (tl=0; tl<=lvl; tl++)
    {
      lvx_nr= (*m2).vx_nr;

      if(tl>0)   /* coarse meshes 2nd order */
	{
	  /* protect vx_new */
	  if (vx_new != NULL) m2->vertex = vx_tmp;

	  err=mesh_refine_uniform_t2(m2);
	  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2,
				   navsto_MG_assem_SDFEM_t21);

	  /* override node positions */
	  if (vx_new != NULL)
	    {
	      vx_tmp     = m2->vertex;
	      if (m2->vx_nr <= vx_max)
		m2->vertex = vx_new;
	      else
		{ 
		  fprintf(stderr, "navsto_MG_assem_SDFEM_t21: "
			  "vx_max insufficient!\n");
		  return FAIL;
		}
	    }
	}

      /* do the assembly */
      err=navsto_assem_SDFEM_t21( K, rhs, u0, m2, tl );
      FUNCTION_FAILURE_HANDLE( err, navsto_assem_SDFEM_t21,
			       navsto_MG_assem_SDFEM_t21);

      if(tl==0)
	{
	  FIDX ndiri, *diris;
	  TRY_MALLOC( diris, (*m2).vx_nr*(*m2).dim, FIDX,
		      navsto_MG_assem_SDFEM_t21);
	  coarse_mat_free( &(*K).cmat);
	  ndiri=0;
	  for (i=0; i<(*K).bn_nr_lvl[tl]; i++)
	    {
	      FIDX node;
	      node=(*K).nodes_lvl[tl][i];
	      for (d=0; d<dim; d++)
		{
		  diris[ndiri]=node+d*lvx_nr;
		  ndiri++;
		}
	    }
	  /* set the coarse grid matrix */
	  err=coarse_mat_set( &(*K).Fs[tl], ndiri, diris, 2,
			      &(*K).cmat); 
	  FUNCTION_FAILURE_HANDLE( err, coarse_mat_set,
				   navsto_MG_assem_SDFEM_t21);
	  free(diris);
	}
	
    }

  /* release the temporary vertex positions */
  if (vx_new!=NULL) free(vx_tmp);

  return SUCCESS;
}



/*FUNCTION*/
int navsto_MG_init_assem_SDFEM_t21(struct mesh *m1, FIDX lvl,
				   struct navsto_matrix *K, 
				   struct vector *rhs,
				   struct vector *u0,
				   struct mesh *m2,
				   double *vx_new, FIDX vx_max
/* initialises the non-changing parts of the navsto_matrix struct K,
   the initial solution u0 and the right hand side vector rhs which
   result from linearised finite element discretisation of the Navier
   Stokes equation on the T2 mesh m2 (build from T1 mesh m1 by
   refinement), such that

           K x = rhs

   defines the new iterate (approximate) solution x=(u_x, u_y, p) of
   the Navier Stokes equation

         -nu Laplace(u) + u * grad u + grad p = f
                                        div u = 0

   with boundary conditions as given in the mesh m    
   (p2 triangles)

   The terms initialised are mainly projector data for the Dirichlet
   boundary conditions and for the zero mean pressure condition, which
   are identical to what they are in the Stokes case, thus we use the
   same routines to initialise them. However, initialisation of the
   remaining parts is performed here as well.

   Input:  m1      - the coarse mesh (T1)
           lvl     - (new) current level
	   vx_new  - overriding set of node positions for T2 meshes
	   vx_max  - maximum number of nodes for use of vx_max

   Output: K       - stiffness matrix in specialised storage, the
                     structure K is initialised here, memory
                     allocated and during successive iterations
                     navsto_MG_assem_SDFEM_t21 has to be used to
                     update the changing parts of the system
           rhs     - the righthand side of the linearised Navier
                     Stokes system, only the vector data structure
                     needs to be given, it is initialised here
	   u0      - a initial guess for the solution which fulfils
         	     the boundary conditions
	   m2      - the refined T2 mesh



   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
				){
  int  err;
  FIDX i, tl;
  double *vx_tmp=NULL;

  /* protect vx_new from being freed */
  if (vx_new== m2->vertex) m2->vertex=NULL;
  mesh_free( m2);

  /* restart with coarse t2 mesh */
  err=mesh_t1_to_t2(m1, m2);
  FUNCTION_FAILURE_HANDLE( err, mesh_t1_to_t2,
			   navsto_MG_init_assem_SDFEM_t21);
  
  /* override node positions */
  if (vx_new != NULL)
    {
      vx_tmp     = m2->vertex;
      if (m2->vx_nr <= vx_max)
	m2->vertex = vx_new;
      else
	{ 
	  fprintf(stderr, "navsto_MG_init_assem_SDFEM_t21: "
		  "vx_max insufficient!\n");
	  return FAIL;
	}
    }

  for (tl=0; tl<=lvl; tl++)
    {
      if(tl>0)   /* coarse meshes 2nd order */
	{
	  /* protect vx_new */
	  if (vx_new != NULL) m2->vertex = vx_tmp;

	  err=mesh_refine_uniform_t2(m2);
	  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2,
				   navsto_MG_init_assem_SDFEM_t21);

	  /* override node positions */
	  if (vx_new != NULL)
	    {
	      vx_tmp     = m2->vertex;
	      if (m2->vx_nr <= vx_max)
		m2->vertex = vx_new;
	      else
		{ 
		  fprintf(stderr, "navsto_MG_init_assem_SDFEM_t21: "
			  "vx_max insufficient!\n");
		  return FAIL;
		}
	    }

	}



      if ( (*K).pdof_lvl[tl]==NULL )
	{
	  (*K).vx_nr_lvl[tl] = (*m2).vx_nr;

	  /* we never want to delete the old nodes, pdof and weight, 
	     so save them */ 
	  (*K).nodes  = NULL;
	  (*K).pdof   = NULL;
	  (*K).weight = NULL;

	  err=navsto_matrix_reinit( K, (*m2).vx_nr );
	  FUNCTION_FAILURE_HANDLE( err, navsto_matrix_reinit,
				   navsto_MG_init_assem_SDFEM_t21);

	  err=stokes_pdof_init_t21( m2, K);
	  FUNCTION_FAILURE_HANDLE( err, stokes_pdof_init__t21,
				   navsto_MG_init_assem_SDFEM_t21);

	  (*K).pdof_lvl[tl]   = (*K).pdof;
	  (*K).weight_lvl[tl] = (*K).weight;
	  (*K).pvx_nr_lvl[tl] = (*K).pvx_nr;


	  err=stokes_diri_bc_init_t2( m2, NULL, NULL, K);
	  FUNCTION_FAILURE_HANDLE( err, stokes_diri_bc_init_t2,
				   navsto_MG_init_assem_SDFEM_t21);

	  (*K).nodes_lvl[tl] = (*K).nodes;
	  (*K).bn_nr_lvl[tl] = (*K).bn_nr;

	  /* needs to go AFTER diri_init */
	  err=navsto_pdof_vels_init_t21( m2, tl, K);
	  FUNCTION_FAILURE_HANDLE( err, navsto_pdof_vels_init__t21,
				   navsto_MG_init_assem_SDFEM_t21);
	}

      if ( (*K).Fs[tl].row_nr==0)
	{
	  err=sparse_alloc( &(*K).Fs[tl], 
			    (*m2).dim * (*m2).vx_nr+ (*K).pvx_nr_lvl[tl],
			    KBLOCKDENSITY );
	  FUNCTION_FAILURE_HANDLE( err, sparse_alloc,
				   navsto_MG_init_assem_SDFEM_t21);
	}
    }


  (*K).msh= m2;
  (*K).vx_nr = (*m2).vx_nr;
  (*K).dim   = (*m2).dim;
  (*K).lvl   = lvl;

  err=vector_alloc( rhs, (*m2).dim * (*m2).vx_nr + (*K).pvx_nr);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_MG_init_assem_SDFEM_t21);
  for (i=0; i<(*rhs).len; i++)
    { (*rhs).V[i]=0.0; }

  err=vector_alloc( u0, (*K).dim * (*K).vx_nr + (*K).pvx_nr);
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, navsto_MG_init_assem_SDFEM_t21);
  for (i=0; i<(*u0).len; i++)
    { (*u0).V[i]=0.0; }

  /* ensure the boundary conditions are satisfied by the initial
     solution */
  err=stokes_diri_bc_init_t2( m2, u0, rhs, K);
  FUNCTION_FAILURE_HANDLE( err, stokes_diri_bc_init_t2,
			   navsto_MG_init_assem_SDFEM_t21);

  /* init Bs[] to size zero */
  /* should be already by virtue of navsto_matrix_init */
  

  if ((*K).mld==NULL) 
    {
      TRY_MALLOC( (*K).mld, 1, struct multilvl, 
		  navsto_MG_init_assem_SDFEM_t21);
    }

  err=multilvl_init_t2( m2, (*m2).dim, (*K).mld);
  FUNCTION_FAILURE_HANDLE( err, multilvl_init_t2,
			   navsto_MG_init_assem_SDFEM_t21);

  if ((*K).ml1==NULL) 
    {
      TRY_MALLOC( (*K).ml1, 1, struct multilvl, 
		  navsto_MG_init_assem_SDFEM_t21);
    }

  err=multilvl_init_t2( m2, 1, (*K).ml1);
  FUNCTION_FAILURE_HANDLE( err, multilvl_init_t2,
			   navsto_MG_init_assem_SDFEM_t21);


  if ((*K).mg==NULL) 
    {
      TRY_MALLOC( (*K).mg, 1, struct mgdata, 
		  navsto_MG_init_assem_SDFEM_t21);
      mg_null_def( (*K).mg);
    }
  if ((*K).mg1==NULL) 
    {
      TRY_MALLOC( (*K).mg1, 1, struct mgdata, 
		  navsto_MG_init_assem_SDFEM_t21);
      mg_null_def( (*K).mg1);
    }


  err=mg_init_tx( NULL, m2, (*K).mld, (*K).mg, NULL );
  FUNCTION_FAILURE_HANDLE( err, mg_init_tx, navsto_MG_init_assem_SDFEM_t21);

  err=mg_init_tx( NULL, m2, (*K).ml1, (*K).mg1, NULL );
  FUNCTION_FAILURE_HANDLE( err, mg_init_tx, navsto_MG_init_assem_SDFEM_t21);


  /* release the temporary vertex positions */
  if (vx_new!=NULL) free(vx_tmp);


  return SUCCESS;
}





/*FUNCTION*/
int navsto_pdof_vels_init_t21(struct mesh *m, FIDX lvl,
			      struct navsto_matrix *K
/* initialises the pdof_vels part of the matrix struct

   Input:  m       - the mesh
	   lvl     - the level who's Fs is written
	   
   In/Out: K       - stiffness matrix in specialised storage, the
                     structure K has to be initialised by
                     stokes_pdof_init_t21 and stokes_diri_bc_init_t2
                     for the current level already, the pdof_vels[lvl]
                     part is set here 

   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		     ){
  FIDX el, i, j;

  FIDX dim, bas_n1, bas_n2, vx_nr, vx_w, pvx_nr, el_w;

  FIDX    *ldofs;         /* local degrees of freedom */
  FIDX    *lpdofs;        /* local pdofs */
  FIDX    *bc_marker;     /* marks each node to be diri-bc node (==0) or
			     not (==1) */
  double this_x;
  FIDX this_x_start;

  struct ilist** pdof_vels;
  double *vel_weight;
  struct intdouble *sorter;
  FIDX *nodes;
  FIDX *pdof;
  FIDX *pdof_sorter_reverse;

  /****************   init ******************************************/
  dim   = (*m).dim;
  vx_nr = (*m).vx_nr;
  vx_w  = (*m).vx_w;
  el_w  = (*m).el_w;

  /* number of nodes per element for the t21 mesh */
  bas_n1 = 3;
  bas_n2 = 6;

  if ((lvl<0)||(lvl>=(*K).lvlmax))
    {
      /* cry */
      fprintf(stderr,
	      "navsto_pdof_vels_init_t21: lvl out of range\n");
      return FAIL;
    }

  pvx_nr    = (*K).pvx_nr_lvl[lvl];

  if ((pvx_nr<=0)||( (*K).bn_nr_lvl[lvl]<=0 ))
    {
      /* cry */
      fprintf(stderr, "navsto_pdof_vels_init_t21: "
	      "pdofs or nodes not initialised?\n");
      return FAIL;
    }

  /* allocate memory for pdof_vels */
  TRY_MALLOC( (*K).pdof_vels[lvl], pvx_nr, struct ilist*, 
	      navsto_pdof_vels_init_t21);
  TRY_MALLOC( (*K).vel_weight[lvl], vx_nr, double, 
	      navsto_pdof_vels_init_t21);
  pdof_vels  = (*K).pdof_vels[lvl];
  vel_weight = (*K).vel_weight[lvl];
  nodes      = (*K).nodes_lvl[lvl];
  pdof       = (*K).pdof_lvl[lvl];

  /* allocate memory for local data */
  TRY_MALLOC( ldofs,  bas_n2, FIDX, navsto_pdof_vels_init_t21);
  TRY_MALLOC( lpdofs, bas_n1, FIDX, navsto_pdof_vels_init_t21);
  TRY_MALLOC( bc_marker, vx_nr, FIDX, navsto_pdof_vels_init_t21);

  /* initialise */
  for (i=0; i<pvx_nr; i++)
    pdof_vels[i]=NULL;

  for (i=0; i<vx_nr; i++)
    vel_weight[i]=0.0;
  

  /* set bc_marker: 1 if node is not bc-node, 0 if it is */
  for (i=0; i<vx_nr; i++)
    bc_marker[i]=1;
  for (i=0; i<(*K).bn_nr_lvl[lvl]; i++)
    bc_marker[nodes[i]]=0;
  


  /* loop over all elements */
  for (el=0; el<(*m).el_nr; el++)
    {
      /* collect the dofs */
      for (i=0; i<bas_n2; i++)
	{
	  ldofs[i]=(*m).elem[el*el_w+MCT2ELNOD1+i];
	}
      for (i=0; i<bas_n1; i++)
	{
	  lpdofs[i]=pdof[(*m).elem[el*el_w+MCT2ELNOD1+i]];
	}

      /* add the velocity nodes to each of the pdof lists */
#ifdef BOXSMOOTHER_LOC_NEUMANN
      /* local Neumann like problems, all vel-dofs of the element are
	 included */
      for (i=0; i<3; i++)
	{
	  FIDX loc_vel;

	  for (j=0; j<6; j++)
	    {
	      loc_vel=ldofs[j]; 
	      ilist_sorted_insert( &pdof_vels[lpdofs[i]], loc_vel);
	    }
	}
#else
      /* local Dirichlet problems, only the interior velocity
	 nodes of the p-dof support patch are included */
      for (i=0; i<3; i++)
	{
	  FIDX loc_vel;
	  FIDX is_bc_element, j;

	  is_bc_element = 0;

	  loc_vel=ldofs[i];         /* node itself */
	  if (bc_marker[loc_vel])
	    ilist_sorted_insert( &pdof_vels[lpdofs[i]], loc_vel);
	  else is_bc_element=1;

	  loc_vel=ldofs[i+3];       /* right midnode */
	  if (bc_marker[loc_vel])
	    ilist_sorted_insert( &pdof_vels[lpdofs[i]], loc_vel);
	  else is_bc_element=1;

	  loc_vel=ldofs[(i+2)%3+3]; /* left midnode */
	  if (bc_marker[loc_vel])
	    ilist_sorted_insert( &pdof_vels[lpdofs[i]], loc_vel);
	  else is_bc_element=1;

#ifdef BOXSMOOTHER_BC_ELEMENT_ALL_V
	  /* if it is a bc element, add all no-bc velocity nodes of
	     the  element */
	  if (is_bc_element) 
	    {
	      for (j=0; j<6; j++)
		{
		  loc_vel=ldofs[j]; 
		  if (bc_marker[loc_vel])
		    ilist_sorted_insert( &pdof_vels[lpdofs[i]], loc_vel);
		}
	    }
#endif
	}
#endif /* end Diri/Neumann switch */


    } /* end loop over all elements */

  /* now check max_n_pdof_vels and sum the appearances of velocity
     nodes */
  for (i=0; i<pvx_nr; i++)
    {
      struct ilist *this;
      FIDX loc_n_vels, node;

      loc_n_vels=0;
      this=pdof_vels[i];
      while (this!=NULL)
	{
	  loc_n_vels++;
	  node=this->data;
	  vel_weight[node]+=1.0;
	  this = this->next;
	}
      if ( loc_n_vels > (*K).max_n_pdof_vels )
	(*K).max_n_pdof_vels = loc_n_vels;
    }

  /* printf("max_n_pdof_vels=%d\n", (*K).max_n_pdof_vels); */

  /* correct the vel_weight to 1/sqrt(#appearances) */
  for (i=0; i<vx_nr; i++)
    {
      if (vel_weight[i]>0.0)
	{
	  /* printf("vel_weight[%3d]=%7.2f\n",i,vel_weight[i]); /* */
#ifdef BOXSMOOTHER_VEL_WEIGHTS
	  vel_weight[i] = 1.0/sqrt(vel_weight[i]);
#else
	  vel_weight[i] = 1.0;
#endif
	}
    }
#ifdef BOXSMOOTHER_SORTED

#ifdef BOXSMOOTHER_SORTED_2DIR
#define SORTER_DOUBLE (2)
#else
#define SORTER_DOUBLE (1)
#endif

#else
#define SORTER_DOUBLE (1)
#endif

  /* now create the pdof_sorter */
  TRY_MALLOC( (*K).pdof_sorter[lvl], pvx_nr*SORTER_DOUBLE, FIDX,
	      navsto_pdof_vels_init_t21);
  /* define the length of the sorter */
  (*K).pdof_sorter_n[lvl] = pvx_nr*SORTER_DOUBLE;

#ifdef BOXSMOOTHER_SORTED
  /* sort the nodes by increasig x-component first, and by increasing
     y-component second */

  /* initialise a intdouble vector to be used to sort the pressure
     nodes */
  TRY_MALLOC( sorter, pvx_nr, struct intdouble,
	      navsto_pdof_vels_init_t21);
  for (i=0; i<vx_nr; i++)
    {
      if (pdof[i]!=-1)
	{
	  /* first we use the x component as data */
	  sorter[pdof[i]].i = i; /* store the node number rather than
				     the pdof for now */
	  sorter[pdof[i]].d = (*m).vertex[i*vx_w+MCT2VXSTRT+0];
	}
    }
  /* sort by the x coordinate */
  qsort( sorter, pvx_nr, sizeof(struct intdouble), comp_intdouble_d);

  /* now for each set of equal x-s sor them by y */
  this_x       = sorter[0].d;
  this_x_start = 0;
  for (i=0; i<pvx_nr; i++)
    {
      FIDX this_node;
      if (this_x != sorter[i].d)
	{
	  /* new x is starting, so sort the old one first by y */
	  qsort( &sorter[this_x_start], i-this_x_start,
		 sizeof(struct intdouble), comp_intdouble_d);

	  /* mark the start of a new x */
	  this_x       = sorter[i].d;
	  this_x_start = i;
	}

      /* now make the y component the data */
      this_node = sorter[pdof[i]].i;
      sorter[pdof[i]].d = (*m).vertex[this_node*vx_w+MCT2VXSTRT+1];
    }
  /* sort the last set of equal x-s by y */
  qsort( &sorter[this_x_start], pvx_nr-this_x_start,
	 sizeof(struct intdouble), comp_intdouble_d);

  /* now the sorter sorts the pressure pdof by (x,y), build the pdof
     
     sorter from that, plus the inverse permutation (needed when we do
     the elements */
  TRY_MALLOC( pdof_sorter_reverse, pvx_nr*SORTER_DOUBLE, FIDX,
	      navsto_pdof_vels_init_t21);
  for (i=0; i<pvx_nr; i++)
    {
      FIDX this_node;

      this_node = sorter[pdof[i]].i;
      (*K).pdof_sorter[lvl][i] = pdof[this_node];
      pdof_sorter_reverse[pdof[this_node]]=i;
    }

#ifdef BOXSMOOTHER_SORTED_2DIR
  /* second sweep, sort the nodes by decreasig y-component first, and
     by decreasing x-component second */
  for (i=0; i<vx_nr; i++)
    {
      if (pdof[i]!=-1)
	{
	  /* first we use (-1) times the y component as data */
	  sorter[pdof[i]].i = i; /* store the node number rather than
				     the pdof for now */
	  sorter[pdof[i]].d = -(*m).vertex[i*vx_w+MCT2VXSTRT+1];
	}
    }
  /* sort by -y coordinate */
  qsort( sorter, pvx_nr, sizeof(struct intdouble), comp_intdouble_d);

  /* now for each set of equal -y-s sort them by -x */
  this_x       = sorter[0].d;
  this_x_start = 0;
  for (i=0; i<pvx_nr; i++)
    {
      FIDX this_node;
      if (this_x != sorter[i].d)
	{
	  /* new y is starting, so sort the old one first by -x */
	  qsort( &sorter[this_x_start], i-this_x_start,
		 sizeof(struct intdouble), comp_intdouble_d);

	  /* mark the start of a new x */
	  this_x       = sorter[i].d;
	  this_x_start = i;
	}

      /* now make (-1) times the x component the data */
      this_node = sorter[pdof[i]].i;
      sorter[pdof[i]].d = -(*m).vertex[this_node*vx_w+MCT2VXSTRT+0];
    }
  /* sort the last set of equal y-s by -x */
  qsort( &sorter[this_x_start], pvx_nr-this_x_start,
	 sizeof(struct intdouble), comp_intdouble_d);

  /* now the sorter sorts the pressure pdof by (-y,-x), extend the
     pdof sorter and the inverse permutation (needed when we do the
     elements */
  for (i=0; i<pvx_nr; i++)
    {
      FIDX this_node;

      this_node = sorter[pdof[i]].i;
      (*K).pdof_sorter[lvl][i+pvx_nr] = pdof[this_node];
      pdof_sorter_reverse[pdof[this_node]+pvx_nr]=i;
    }
#endif /* BOX_SMOOTHER_SORTED_2DIR */

  /* done pdof_sorter */
  free(sorter);
#else  /* #ifndef BOXSMOOTHER_SORTED */
  for (i=0; i<pvx_nr; i++)
    {
      (*K).pdof_sorter[lvl][i] = i;
    }
#endif /* #ifdef BOXSMOOTHER_SORTED */


  /* elem-based-smoother related data init */
  (*K).bc_marker_lvl[lvl]=bc_marker;
  TRY_MALLOC( (*K).elem_lvl[lvl], el_w*(*m).el_nr, FIDX,
	      navsto_pdof_vels_init_t21);
  for (i=0; i<el_w*(*m).el_nr; i++)
    (*K).elem_lvl[lvl][i]=(*m).elem[i];
  (*K).el_nr_lvl[lvl]=(*m).el_nr;

  /* now sort the elements by their highest ranking pdof, if switched
     on */
  TRY_MALLOC( (*K).elem_sorter[lvl], (*m).el_nr*SORTER_DOUBLE, FIDX,
	      navsto_pdof_vels_init_t21);
  /* define the length of the sorter */
  (*K).elem_sorter_n[lvl] = (*m).el_nr*SORTER_DOUBLE;

#ifdef BOXSMOOTHER_SORTED
  TRY_MALLOC( sorter, (*m).el_nr, struct intdouble,
	      navsto_pdof_vels_init_t21);
  for (i=0; i< (*m).el_nr; i++)
    {
      FIDX best_p, best_p_place;

      best_p = (*m).elem[i*el_w + MCT2ELNOD1 + 0 ];
      best_p_place = pdof_sorter_reverse[pdof[best_p]];
	for (j=1; j<3; j++)
	{
	  FIDX this_node;
	  this_node=(*m).elem[i*el_w + MCT2ELNOD1 + j ];
	  if (pdof_sorter_reverse[pdof[this_node]] < best_p_place)
	    {
	      best_p=this_node;
	      best_p_place = pdof_sorter_reverse[pdof[best_p]];
	    }
	}
      sorter[i].i = i;
      sorter[i].d = (double) best_p_place;
    }
  /* sort */
  qsort(sorter,(*m).el_nr,sizeof(struct intdouble), comp_intdouble_d);

  /* now store the sorted list into elem_sorter */
  for (i=0; i< (*m).el_nr; i++)
    {
      (*K).elem_sorter[lvl][i] = sorter[i].i;
    }

#ifdef BOXSMOOTHER_SORTED_2DIR
  /* second sort according to second part of pdof_sorter_reverse */
  for (i=0; i< (*m).el_nr; i++)
    {
      FIDX best_p, best_p_place;

      best_p = (*m).elem[i*el_w + MCT2ELNOD1 + 0 ];
      best_p_place = pdof_sorter_reverse[pdof[best_p]+pvx_nr];
	for (j=1; j<3; j++)
	{
	  FIDX this_node;
	  this_node=(*m).elem[i*el_w + MCT2ELNOD1 + j ];
	  if (pdof_sorter_reverse[pdof[this_node]+pvx_nr] < best_p_place)
	    {
	      best_p=this_node;
	      best_p_place = pdof_sorter_reverse[pdof[best_p]+pvx_nr];
	    }
	}
      sorter[i].i = i;
      sorter[i].d = (double) best_p_place;
    }
  /* sort */
  qsort(sorter,(*m).el_nr,sizeof(struct intdouble), comp_intdouble_d);

  /* now store the sorted list into elem_sorter, second part */
  for (i=0; i< (*m).el_nr; i++)
    {
      (*K).elem_sorter[lvl][i+(*m).el_nr] = sorter[i].i;
    }
#endif /* ifdef BOX_SMOOTHER_SORTED_2DIR */

  /* cleanup sorter business */
  free(sorter);
  free(pdof_sorter_reverse);

#else /* #ifndef BOXSMOOTHER_SORTED */
  for (i=0; i< (*m).el_nr; i++)
    {
      (*K).elem_sorter[lvl][i] = i;
    }
#endif /* #ifdef BOXSMOOTHER_SORTED */

#undef SORTER_DOUBLE

  /* free local data */
  free(ldofs);
  free(lpdofs);

  return SUCCESS;
}
  
