/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2013, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/
/*
FILE assem_conv_diff.c
HEADER assem_conv_diff.h

TO_HEADER:


// useful text macros
#include "feins_macros.h"
// data structures 
#include "datastruc.h"
#include "sparse_struct.h"

*/



/* prototypes of external functions */
#include <math.h>
#include "elements.h"
#include "cubature.h"
#include "sparse.h"
#include "mesh.h"
#include "feins_lapack.h"    /* link against LAPACK! */

/* function prototypes */
#include "assem_conv_diff.h"

/*FUNCTION*/
int assem_conv_diff_tx(struct sparse *M, struct sparse *K,
                       struct vector *rhs,  struct vector *rhsdot,
		       double *maxPe, double *eps,
		       struct sparse *A, struct sparse *B,
		       double cA_K, double cA_M, 
		       double cB_K, double cB_M, 
		       struct mesh *m, int stab, double time,
		       int type
/* performs the assembly of the stiffness matrix K and the right hand
   side vector rhs which result from SUPG stabelised finite element
   discretisation of the convection-diffusion equation on the
   triangular mesh m, such that

           K x = rhs

   or for time dependent problems 
        
           M \dot{x} + K x = rhs

   defines the (approximate) solution x=(u) of

     [du/dt]  -\nabla \dot eps \nabla(u) + b * grad u = f

   with where eps is a matrix of diffusion coefficients,
   b=m.function[0] and boundary conditions as given in the mesh m 

   In addition or alternatively the matrices 
               A= cA_K*K + cA_M*M, 
               B= cB_K*K + cB_M*M
   can be computed, where the coefficients c*_* are user defined. This
   is designed in order to allow the computation of the matrices for
   time stepping schemes, e.g. the backward Euler scheme can be
   written as
               (1/tau M + K) u(k+1) = b(t) + 1/tau M u(k)
              =           A  u(k+1) = b(t) +       B u(k).            

   Input:  eps     - matrix of diffusion coefficients, eps should be
                     symmetric positive definite, eps_i,j=eps[i*dim+j]
           m       - the mesh
	   stab    - to indicate wether stabilization shall be used
	             for the advection term or not, stab==0: use no
	             stabilisation, stab==1: use SUPG stabilisation
           type    - type of mesh, ==1 means linear (P1) elements, 
                     ==2 means quadratic (P2) elements,
           cA_K, cA_M, cB_K, cB_M
                   - coefficients for assembling K and M to the
                     matrices A and B, as described above. ignored if
                     the pointer to the correspoding matrix (A or B)
                     is ==NULL     
	   time    - for time dependent problems this is the time at
	             which K,M,A,B,rhs,rhsdot are to be evaluated. for
	             stationary problems just use time=0.0
           
   In/Out: M       - mass matrix as struct sparse, initialisation
                     and finalisation should be done by the caling
                     routine, ignored if ==NULL
           K       - stiffness matrix as struct sparse, initialisation
                     and finalisation should be done by the caling
                     routine, ignored if ==NULL
           A       - A=cA_K*K+cA_M*M, as struct sparse, initialisation
                     and finalisation should be done by the caling
                     routine, ignored if ==NULL
           B       - B=cB_K*K+cB_M*M, as struct sparse, initialisation
                     and finalisation should be done by the caling
                     routine, ignored if ==NULL
	   rhs     - righthand side vector, has to be initialised by
                     the calling routine
	   rhsdot  - time derivative of the righthand side vector, has
	             to be initialised by the calling routine

   Output: maxPe   - maximum Peclet number observed in the mesh

   Return: SUCCESS - success
           FAIL    - failure, see error message, output will not be
                     valid
*/
		       ){
#pragma omp parallel
  {
  FIDX el, d, i, j, k, r, s;
  int  err;

  FIDX dim, bas_n, bas_n_2, vx_nr, eg_nr, el_nr,
    el_w, vx_w, eg_w, bd_w, fu_w, fc_w;

  /* for FORTRAN calls: */
  char fTrans='T', fNoTrans='N';
  double done=1.0, dzero=0.0;
  int fdim, fbas_n, fbas_nc; 

  FIDX l_mctxVXSTRT, l_mctxEGNOD1, l_mctxELNOD1, l_mctxBDTYPE,
    l_mctxBDEDGE;
  int cubat_order;

  double min_eps;        /* min eigenvalue of matrix eps */

  struct int_data iform; /* integration formula 2d   */
  double *bk;            /* velocity vector at the integration point */
  double fxc;            /* rhs function f(.) at the integration point */
  double *dotfxc;        /* time derivative of rhs function f(.) at
			    the integration point */
  double dotfxc_mem;     /* memory in case dotfxc is used */
  double *rhsloc;        /* element rhs */
  double *rhsdotloc;     /* element rhsdot */
  double *elK;           /* element stiffness matrix */
  double *elM;           /* element mass matrix */
  double *elAB;          /* temporary storage for cA_M*elM and the
			    other c?_?*el? */
  double *Jac, *Jacinv;  /* Jacobian of the element mapping and its
			    inverse */
  double *Jinvgrad;      /* inverse Jacobian times gradphi */
  double *Hessglob;      /* real world Hessian */
  double detJac;         /* determinant of the Jacobian */

  FIDX   m_type;
  double *phi, *gradphi, *hessphi;
  double weight;

  double *xc;            /* the center point of the element */
  double *bc;            /* velocity at the element center */
  double h, abs_bc, heg, h_max, hbc, hbcmax, pe, beta;
                         /* values used to determine the mesh peclet
			    number and the stabilization term */ 
  int    bas_nc;         /* number of linear basis functions (used in xc)*/
  double *phic, *gphic, *hphic, *gradc_glob; 
                         /* the basis function for linear elements
                            values at this point */ 

  int computeMatrices;

  computeMatrices= ( ((A!=NULL) || (B!=NULL)) || ((M!=NULL) || (K!=NULL)) );
    

  /****************   init ******************************************/
  dim   = (*m).dim;
  vx_nr = (*m).vx_nr;
  eg_nr = (*m).eg_nr;
  el_nr = (*m).el_nr;
  vx_w  = (*m).vx_w;
  el_w  = (*m).el_w;
  eg_w  = (*m).eg_w;
  bd_w  = (*m).bd_w;
  fu_w  = (*m).fu_w;
  fc_w  = (*m).fc_w;



  /* get integration formula */
  /* required order: 
      type=1:  grad u * grad v     order 0
               (b' * grad u)*v     order 1+order(b)
               f v                 order 1+order(f)
               (b' * grad u)*(b' * grad v)
                                   order 0+2*order(b)
                                   ------------------
                     use:          >=order 2*2, say 4
     --------------------------------------------------
      type=2:  grad u * grad v     order 2
               (b' * grad u)*v     order 3+order(b)
               f v                 order 2+order(f)
               (b' * grad u)*(b' * grad v)
                                   order 2+2*order(b)
                                   -----------------
                     use:          >=order 2*3, say 7
  */
  switch(type)
    {
    case 1:
      l_mctxVXSTRT = MCT1VXSTRT;
      l_mctxEGNOD1 = MCT1EGNOD1;
      l_mctxELNOD1 = MCT1ELNOD1;
      l_mctxBDTYPE = MCT1BDTYPE;
      l_mctxBDEDGE = MCT1BDEDGE;
      cubat_order=4;
      break;
    case 2:
      l_mctxVXSTRT = MCT2VXSTRT;
      l_mctxEGNOD1 = MCT2EGNOD1;
      l_mctxELNOD1 = MCT2ELNOD1;
      l_mctxBDTYPE = MCT2BDTYPE;
      l_mctxBDEDGE = MCT2BDEDGE;
      cubat_order=7;
      break;
    default:
      fprintf(stderr,"assem_conv_diff_tx: unknown type=%d\n",type);
      FEINS_FAILURE_ACTION
    }

  m_type=(FIDX) type;
  err=cubature_bases( dim, cubat_order, tria, 1, &m_type, &iform); 
  FUNCTION_FAILURE_HANDLE( err, cubature_bases, assem_conv_diff_tx);

  /* make phi and gradphi better accessible */
  phi    = (iform.bases[0]->phi);
  gradphi= (iform.bases[0]->gradphi);
  hessphi= (iform.bases[0]->hessphi);
  bas_n  = (iform.bases[0]->num_basis);

  fbas_n = (int) bas_n;
  fdim   = (int) dim;

  if (dim > 2)
    {
      /* cry */
      fprintf(stderr,
	      "assem_conv_diff_tx: dim >2 not implemented (Jacinv+?)\n");
      FEINS_FAILURE_ACTION
    }

  if ( ((K!=NULL)&&((*K).row_nr!=vx_nr))
       ||((M!=NULL)&&((*M).row_nr!=vx_nr))
       ||((A!=NULL)&&((*A).row_nr!=vx_nr)) 
       ||((B!=NULL)&&((*B).row_nr!=vx_nr))
       )
    {
      /* cry */
      fprintf(stderr,
	      "assem_conv_diff_tx: size of K,M,A,B wrong???\n");
      FEINS_FAILURE_ACTION
    }
    
  
  /* check that diffusion matrix eps is symmetric positive definite */
  /* first symmetry */
  for (i=0; i<dim; i++)
    for (j=i+1; j<dim; j++)
      {
	if (eps[i*dim+j] != eps[j*dim+i])
	  {
	    fprintf(stderr,
		    "assem_conv_diff_tx: eps not symmetric???\n");
	    FEINS_FAILURE_ACTION
	  }
      }
  {
    /* check that diffusion matrix eps is positive definite */
    char fJOBZ='N';
    char fUPLO='U';
    int lwork=(dim+2)*dim, fdim=dim;
    double *work, *evs;
    int info;
    
    TRY_MALLOC( work, lwork, double, assem_conv_diff_tx);
    TRY_MALLOC( evs, dim, double, assem_conv_diff_tx);
    dsyev_( &fJOBZ, &fUPLO, &fdim, eps, &fdim, 
	    evs, work, &lwork, &info );
    if (info==0)
      { 
	/* printf("evs= %e %e \n", evs[0], evs[1]); */
	min_eps=evs[0];
      }
    else
      {
	fprintf(stderr,
		"assem_conv_diff_tx: error determining eigenvalues of eps\n");
	FEINS_FAILURE_ACTION
      }
    free(evs);
    free(work);

  }
  if (min_eps<=0)
    {
      fprintf(stderr,
	      "assem_conv_diff_tx: eps min eigenvalue <=0\n");
      FEINS_FAILURE_ACTION
    }


  bas_n_2=bas_n*bas_n;    /* bas_n^2 */


  TRY_MALLOC( xc, dim, double, assem_conv_diff_tx);
  /* the center of gravity of the master element is
		     (1/dim,...,1/dim) */
  xc[0]=1.0/( (double)dim);  
  for (i=1; i<dim; i++)
    {
      xc[i]=xc[0];
    }
  /* evaluate the basis functions for this point */
  err=eval_basis( dim, tria, 1, 1, xc, &i, &phic, &gphic, &hphic);
  FUNCTION_FAILURE_HANDLE( err, eval_basis, assem_conv_diff_tx);
  /* free unneeded info */
  free(hphic);
  bas_nc=3;



  /* allocate memory for local data */
  TRY_MALLOC( bk, dim, double, assem_conv_diff_tx);
  TRY_MALLOC( bc, dim, double, assem_conv_diff_tx);
  if (computeMatrices)
    {
      TRY_MALLOC( elK, bas_n_2, double, assem_conv_diff_tx);
      TRY_MALLOC( elM, bas_n_2, double, assem_conv_diff_tx);
      TRY_MALLOC( elAB, bas_n_2, double, assem_conv_diff_tx);
      TRY_MALLOC( Hessglob, bas_n*dim*dim, double, assem_conv_diff_tx);
    }
  else
    { 
      elK     =NULL;
      elM     =NULL;
      elAB    =NULL;
      Hessglob=NULL;
    }

  TRY_MALLOC( Jac, dim*dim, double, assem_conv_diff_tx);
  TRY_MALLOC( Jacinv, dim*dim, double, assem_conv_diff_tx);

  TRY_MALLOC( Jinvgrad, dim*bas_n, double, assem_conv_diff_tx);

  TRY_MALLOC( gradc_glob, dim*bas_nc, double, assem_conv_diff_tx);

  TRY_MALLOC( rhsloc, bas_n, double, assem_conv_diff_tx);
  TRY_MALLOC( rhsdotloc, bas_n, double, assem_conv_diff_tx);

  /* clear K,M,A,B */
#pragma omp master
  {
  if (K!=NULL)
    {
      sparse_empty(K);
    }
  if (M!=NULL)
    {
      sparse_empty(M);
    }
  if (A!=NULL)
    {
      sparse_empty(A);
    }
  if (B!=NULL)
    {
      sparse_empty(B);
    }

  /* clear rhs */
  for (i=0; i<vx_nr; i++)
    {
      rhs->V[i]=0.0;
    }
  *maxPe=0.0;
  } /* END OPM master */

  /* clear rhsdot */
  if (rhsdot!=NULL)
    {
#pragma omp master
      {
      for (i=0; i<vx_nr; i++)
	{
	  rhsdot->V[i]=0.0;
	}
      }  /* END OPM master */
      dotfxc=&dotfxc_mem;
    }
  else
    {
      dotfxc=NULL;
    }

  /* wait till all threads have finished init phase */
#pragma omp barrier


  /* loop over all elements */
#pragma omp for schedule(static)
  for (el=0; el<el_nr; el++)
    {
      /* determine the mesh peclet number for this element */
      /* first get xc, the coordinates of the center of the element,
	 then evaluate bc, the velocity vector there */
      for (j=0; j<dim; j++)
	{
	  xc[j]=0.0;
	  for (i=0; i<bas_nc; i++)
	    xc[j]+=phic[i]
	      *(*m).vertex[(*m).elem[el*el_w+l_mctxELNOD1+i]*vx_w
			   +l_mctxVXSTRT+j];
	}
      /* velocity always given by b=m.function[0] */
      err=mesh_func_eval( m, 0, xc, time, dim, bc, NULL, NULL);
      FUNCTION_FAILURE_HANDLE( err, mesh_func_eval, assem_conv_diff_tx);
      
      /* determine norm of the velocity vector */
      abs_bc=0;
      for (j=0; j<dim; j++)
	abs_bc+=bc[j]*bc[j];
      abs_bc=sqrt(abs_bc);

      /* get the length of the element in stream-wise direction */
      /*   %% diameter of element in direction btil (Petr Knobloch)
	   %hmax2=0;
	   %for j=1:3
	   %  hmax2=hmax2+abs(b'*gradglob(:,j));
	   %end
	   %h=2*norm(b)/hmax2;
      */

      /* first compute the gradient of the linear ansatz functions */
      /* compute the Jacobian for this element */
      /* Jac=0 */
      for (i=0; i<dim*dim; i++)
	Jac[i]=0.0;
	  
      /* Jac = sum_{i=nodes} vertex(i)*gradphi_i^T */
      for (i=0;i<bas_nc; i++)
	{
	  for (j=0; j<dim; j++)
	    {
	      for (r=0; r<dim; r++)
		{
		  Jac[j*dim+r]+= 
		    (*m).vertex[(*m).elem[el*el_w+l_mctxELNOD1+i]*vx_w
				+l_mctxVXSTRT+j]
		    * gphic[0*bas_nc*dim +i*dim +r];
		}
	    }
	}

      /* get detJac */
      detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
      /* get Jacinv (here direct) */
      Jacinv[0]=1.0/detJac*Jac[3];
      Jacinv[1]=-1.0/detJac*Jac[1];
      Jacinv[2]=-1.0/detJac*Jac[2];
      Jacinv[3]=1.0/detJac*Jac[0];


      /* gradc_glob= Jacinv * gradphi[k,:,:] 
	 (=real world gradient T1)
      */
      fbas_nc=bas_nc;
      dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_nc, &fdim,
	      &done, Jacinv, &fdim, &(gphic[0*bas_nc*dim]), &fdim,
	      &dzero, gradc_glob, &fdim );

      h_max=0.0;
      for (i=0; i<bas_nc; i++)
	{
	  hbcmax=0.0;
	  for(d=0; d<dim; d++)
	    {
	      hbcmax+=bc[d]*gradc_glob[i*dim+d];	  
	    }
	  h_max+=fabs(hbcmax);
	}
      h=2*abs_bc/h_max;

      /* this elements Peclet number Pe */
      pe=0.5*h*abs_bc/min_eps;

      /* update the maximum Pe */
#pragma omp critical
      {
	if (*maxPe<pe) *maxPe=pe;
      } /* end OMP critical */

      /* stabilization parameter beta */
      if (abs_bc!=0.0)
	{
#ifdef RAMAGE_NOT_TEZDUYAR
	  /* Ramage stab parameter */
	  //beta=(0.5*h-min_eps)/abs_bc;
	  /* John/Novo */
	  //beta=sqrt(1.0/cB_M)*h/(4*abs_bc);

	  beta=0.0;
	  /* in ROS3P with adaptive timestepping, the John/Novo
	     parameter choice appears  to generate more oszillations
	     than beta=0, and also restricts the timesteps further

	     Ramage appears to be neither advantage nor disadvantage
	     compared to beta=0, because sufficient diffusion is
	     present so stabilization switches of
	  */
	  //printf("beta=%e\n",beta);
#else
	  /* Tezduyar stab parameter */
	  beta=(0.5*h/abs_bc)*fmin(1.0, 0.33333333*pe);
#endif
	}
      else
	{
	  beta=0.0;
	}
      if ((beta<0.0)||(stab!=1)) beta=0.0;


      if (computeMatrices)
	{
	  /* set elK and elM to zero */
	  for (i=0; i<bas_n_2; i++) 
	    {
	      elK[i]=0.0;
	      elM[i]=0.0;
	    }
	}
      /* set rhsloc and rhsdotloc to zero */
      for (i=0; i<bas_n; i++)
	{
	  rhsloc[i]=0.0;
	  rhsdotloc[i]=0.0;
	}


      /* loop over all integration points */
      for (k=0; k<iform.num_points; k++)
	{
	  /* compute the Jacobian for this element */
	  /* Jac=0 */
	  for (i=0; i<dim*dim; i++)
	    Jac[i]=0.0;
	  
	  /* Jac = sum_{i=nodes} vertex(i)*gradphi_i^T */
	  for (i=0;i<bas_n; i++)
	    {
	      for (j=0; j<dim; j++)
		{
		  for (r=0; r<dim; r++)
		    {
		      Jac[j*dim+r]+= 
			(*m).vertex[(*m).elem[el*el_w+l_mctxELNOD1+i]*vx_w
				    +l_mctxVXSTRT+j]
			* gradphi[k*bas_n*dim +i*dim +r];
		    }
		}
	    }

	  /* get detJac */
	  detJac=Jac[0]*Jac[3]-Jac[2]*Jac[1];
	  
	  /* get Jacinv (here direct) */
	  Jacinv[0]=1.0/detJac*Jac[3];
	  Jacinv[1]=-1.0/detJac*Jac[1];
	  Jacinv[2]=-1.0/detJac*Jac[2];
	  Jacinv[3]=1.0/detJac*Jac[0];

	  /* the real wolrd weight of this integration point */
	  weight=fabs(detJac) * iform.weights[k];

	  /* Jinvgrad= Jacinv * gradphi[k,:,:] 
	     (=real world gradient T1)
	  */
	  dgemm_( &fNoTrans, &fNoTrans, &fdim, &fbas_n, &fdim,
		  &done, Jacinv, &fdim, &(gradphi[k*bas_n*dim]), &fdim,
		  &dzero, Jinvgrad, &fdim );

	  if (computeMatrices)
	    {
	      /* real world Hessian (neglecting second derivatives of the
		 element mapping) */
	      for (i=0;i<bas_n; i++)
		{
		  int kk,mm;
		  
		  for (r=0; r<dim; r++) 
		    for (s=0; s<dim; s++)
		      {
			Hessglob[i*dim*dim+r*dim+s]=0.0;
		      }
		  for (s=0; s<dim; s++)
		    for (kk=0; kk<dim; kk++) 
		      {
			double y_ks=0.0;
			for (mm=0; mm<dim; mm++) 
			  {
			    y_ks+=(hessphi[k*bas_n*dim*dim+i*dim*dim
					   + kk*dim + mm]
				   * Jacinv[mm*dim+s]);
			  }
			
			for (r=0; r<dim; r++) 
			  {
			    Hessglob[i*dim*dim+r*dim+s]+=
			      Jacinv[kk*dim+r]*y_ks;
			  }
		      }
		}
	      /* elK += |detJac|*weigth[k] * Jinvgrad^T*eps*Jinvgrad */
	      for (i=0; i<bas_n; i++) /* row */
		for (j=0; j<bas_n; j++) /* col */
		  for (r=0; r<dim; r++) /* index for Jinvgrad^T*y */
		    {
		      double y_r=0.0;
		      for (s=0; s<dim; s++) /* index for y=(eps*Jinvgrad) */
			{
			  y_r += eps[r*dim+s] * Jinvgrad[j*dim+s];
			}
		      elK[i*bas_n+j] += Jinvgrad[i*dim+r]*y_r * weight;
		    }
	    }


	  /* define bk[], velocity vector of convection term */
	  for (j=0; j<dim; j++)
	    {
	      xc[j]=0.0;
	      for (i=0; i<bas_n; i++)
		xc[j]+=phi[k*bas_n+i]
		  *(*m).vertex[(*m).elem[el*el_w+l_mctxELNOD1+i]*vx_w
			       +l_mctxVXSTRT+j];
	    }
	  /* velocity always given by b=m.function[0] */
	  err=mesh_func_eval( m, 0, xc, time, dim, bk, NULL, NULL);
	  FUNCTION_FAILURE_HANDLE( err, mesh_func_eval, assem_conv_diff_tx);


	  /* evaluate rhs function f(xc) (and if required its time derivative) */
	  err=mesh_func_eval ( m, (*m).face[el*fc_w+MCT1FCRHSF],
			       xc, time, 1, &fxc, NULL, dotfxc);
	  FUNCTION_FAILURE_HANDLE(err,mesh_func_eval, assem_conv_diff_tx);


	  /* elK(i,j) += int(element)(
	            sum_d ( bk[d]* (d phi_j/d x_d)) 
	            * (phi_i + beta* sum_d (bk[d] *d phi_i/d x_d)))
	     (beta>0 --> SUPG-stabilization)

	     the integration for each entry of elK and rhs :
	  */
	  for (i=0; i<bas_n; i++)
	    {
	      double b_gradphi_i=0.0;
	      for (d=0; d<dim; d++)
		{
		  b_gradphi_i+=bk[d]*Jinvgrad[i*dim+d];
		}

	      for (j=0; j<bas_n; j++)
		{
		  double b_gradphi_j=0.0;
		  double eps_laplace_gradphi_j=0.0;

		  for (d=0; d<dim; d++)
		    {
		      b_gradphi_j+=bk[d]*Jinvgrad[j*dim+d];
		    }


		  if (computeMatrices)
		    {
		      for (r=0; r<dim; r++) 
			for (s=0; s<dim; s++)
			  {
			    eps_laplace_gradphi_j+=
			      eps[r*dim+s]*Hessglob[j*dim*dim+r*dim+s];
			  }
		      
		      elK[j*bas_n+i] +=
			( (b_gradphi_j * phi[k*bas_n+i])
			  +(-eps_laplace_gradphi_j+b_gradphi_j)* beta*b_gradphi_i)
			* weight ; /* */
		      
		      elM[j*bas_n+i] +=
			( phi[k*bas_n+j]
			  *(phi[k*bas_n+i])+ beta*b_gradphi_i)
			* weight ; /* */
		    }
		}
	      /* compute rhs */
	      rhsloc[i] +=
		fxc *( phi[k*bas_n+i] + beta*b_gradphi_i)
		* weight ;

	      /* compute rhsdot */
	      if (rhsdot!=NULL)
		{
		  rhsdotloc[i] +=
		    (*dotfxc) *( phi[k*bas_n+i] + beta*b_gradphi_i)
		    * weight ;
		}
	    }
	} /* end loop intergration points */
      
      /***************************************************************
       * elK is ready, now assemble rhs, K, M, A, B                  *
       ***************************************************************/


#pragma omp critical
      {
	for (i=0; i<bas_n; i++)
	  {
	    (*rhs).V[(*m).elem[el*el_w+i]] += rhsloc[i];
	  }
      } /* end OMP critical */

#pragma omp critical
      {
	if (rhsdot!=NULL)
	  {
	    for (i=0; i<bas_n; i++)
	      {
		(*rhsdot).V[(*m).elem[el*el_w+i]] += rhsdotloc[i];
	      }
	  }
      } /* end OMP critical */

      if (K!=NULL)
	{
#pragma omp critical
	  {
	    err=sparse_add_local( K, NoTrans,
				  bas_n, &(*m).elem[el*el_w+l_mctxELNOD1], 
				  bas_n, &(*m).elem[el*el_w+l_mctxELNOD1],
				  elK, bas_n );
	    FUNCTION_FAILURE_HANDLE( err, sparse_add_local, assem_conv_diff_tx);
	  } /* end OMP critical */
	}

      if (M!=NULL)
	{
#pragma omp critical
	  {
	    err=sparse_add_local( M, NoTrans,
				  bas_n, &(*m).elem[el*el_w+l_mctxELNOD1], 
				  bas_n, &(*m).elem[el*el_w+l_mctxELNOD1],
				  elM, bas_n );
	    FUNCTION_FAILURE_HANDLE( err, sparse_add_local, assem_conv_diff_tx);
	  } /* end OMP critical */
	}

      if (A!=NULL)
	{
#pragma omp critical
	  {
	  if (cA_M!=0.0)
	    {
	      for (i=0; i<bas_n_2; i++)
		{
		  elAB[i] = cA_M*elM[i];
		}
	      err=sparse_add_local( A, NoTrans,
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1], 
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1],
				    elAB, bas_n );
	      FUNCTION_FAILURE_HANDLE( err, sparse_add_local, assem_conv_diff_tx);
	    }
	  if (cA_K!=0.0)
	    {
	      for (i=0; i<bas_n_2; i++)
		{
		  elAB[i] = cA_K*elK[i];
		}
	      err=sparse_add_local( A, NoTrans,
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1], 
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1],
				    elAB, bas_n );
	      FUNCTION_FAILURE_HANDLE( err, sparse_add_local, assem_conv_diff_tx);
	    }
	  } /* end OMP critical */
	}
      if (B!=NULL)
	{
#pragma omp critical
	  {
	  if (cB_M!=0.0)
	    {
	      for (i=0; i<bas_n_2; i++)
		{
		  elAB[i] = cB_M*elM[i];
		}
	      err=sparse_add_local( B, NoTrans,
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1], 
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1],
				    elAB, bas_n );
	      FUNCTION_FAILURE_HANDLE( err, sparse_add_local, assem_conv_diff_tx);
	    }
	  if (cB_K!=0.0)
	    {
	      for (i=0; i<bas_n_2; i++)
		{
		  elAB[i] = cB_K*elK[i];
		}
	      err=sparse_add_local( B, NoTrans,
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1], 
				    bas_n, &(*m).elem[el*el_w+l_mctxELNOD1],
				    elAB, bas_n );
	      FUNCTION_FAILURE_HANDLE( err, sparse_add_local, assem_conv_diff_tx);
	    }
	  } /* end OMP critical */
	}

    } /* end loop elements */

  /* printf("lvl=%"dFIDX",  maxPe=%8.1e\n",  lvl, *maxPe); /* */

  /* free local data */
  free(gradc_glob);
  free(Hessglob);
  free(Jinvgrad);
  free(Jacinv);
  free(Jac);
  free(elK);
  free(elM);
  free(elAB);
  free(bc);
  free(xc);
  free(bk);

  free(phic);
  free(gphic);

  free_intdata (&iform);

  } /* END OMP parallel */

  return SUCCESS;
}
