/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2013, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/

#include <math.h>
#ifdef fmax
#warning "math.h"
#endif

#include "feins_macros.h"
#include "datastruc.h"
#include "mesh.h"
/*#include "meshdetails.h"*/
#include "sparse.h"
#include "lin_solver.h"
#include "linsolve_umfpack.h"
#include "assembly.h"
#include "assem_conv_diff.h"
#include "gen_aux.h"
#include "mesh_deform.h"
#include "lame_adj.h"

#ifndef __USE_GNU
#define __USE_GNU
#endif

//#include <unistd.h>
#include <string.h>

#define TIMEGET {\
   gettimeofday(&tv,&tz); \
   sec=tv.tv_sec; \
   musec=tv.tv_usec; \
   ti=((double)sec+((double)musec)*1e-6); }

int main(int argc, char *argv[])
{

  struct sparse A, M, K;
  struct projector1 P;
  
  struct vector b;
  struct vector xtrial1, xtrial2, *x, *xtrial, diff, xwrite;

  struct mesh msh1;
  struct mesh msh2;

  struct solver_settings set;

  struct timeval tv;
  struct timezone tz;
  int sec, musec;
  double t0, ti, ta, tb, ts;

  int  err;
  FIDX dim, vx_nr, i, j;

  FIDX level0, lvlm, lvl;

  double Tstrt, Tend;
  FIDX tstep, write_steps;

  struct linsolver umfpackdata;
  FIDX *umfpack_isdiri;
  struct vector umfpack_help;

  double maxPe, tau;
  //double eps[]={1e-1, 1e-6, 1e-6, 1e-7};
  //double eps[]={1e-4, 0, 0, 1e-4};
  double eps[]={30, 0, 0, 30};

  int iter;
  double resi, norm;

  char *buffer;
  
  /* J.Lang and J.Verwer, ROS3P -- An accurate third order Rosenbrock
     solver designed for parabolic problems, BIT, Vol 41:4,
     pp. 731-738, 2001  

     coefficients from Table 5.1 */

  double gamma=7.886751345948129e-01;

  double a21 = 1.267949192431123e+00,  c21 = -1.607695154586736e+00;
  double a31 = 1.267949192431123e+00,  c31 = -3.464101615137755e+00;
  double /*a32 = 0.0000000000000e+00*/ c32 = -1.732050807568877e+00;

  double alpha1 = 0.000000000000000e+00, gamma1 =  7.886751345948129e-01;
  double alpha2 = 1.000000000000000e+00, gamma2 = -2.113248654051871e-01;
  double alpha3 = 1.000000000000000e+00, gamma3 = -1.077350269189626e+00;

  double m1 = 2.000000000000000e+00;
  double m2 = 5.773502691896258e-01;
  double m3 = 4.226497308103742e-01;

  double mhat1 = 2.113248654051871e+00;
  double mhat2 = 1.000000000000000e+00;
  double mhat3 = 4.226497308103742e-01;  
  
  /* output at intermediate points, 
     J.Lang, Adaptive Multilevel Solution of Nonlinear Parabolic PDE
     Systems, Springer 2001, 
     Table C.2, page 132
     equation (V.16), page 55 
  */
  double mtheta1_1=+2.5358983848622453, mtheta1_2=-0.5358983848622453;
  double mtheta2_1=-1.1547005383792515, mtheta2_2=+1.7320508075688773;
  double mtheta3_1=+1.1547005383792515, mtheta3_2=-0.7320508075688773;


 

  struct vector Un1, Un2, Un3, bdot, sumU;
  double factor1, factor2;

  // atol was 1e-7
  double rho=0.9, alpha_low=0.1, alpha_high=10, atol=1e-4, rtol=1e-5, tol;
  double time, tau_old, tau_new, err_norm_old, err_norm_new, tau_write,
    t_write, time_old;
  int not_done=1;

#ifdef HAVE_OPENMP
#pragma omp parallel
  {
#pragma omp master
    {
      printf("OpenMP num_threads=%d\n",omp_get_num_threads());
      printf("  should be number of phyisical cores, use environment variable "
	     " OMP_NUM_THREADS to modify,\n");
      printf("  e.g. (bash shell): \"export OMP_NUM_THREADS=4\" \n");
      printf("  e.g. (most other): \"set OMP_NUM_THREADS 4\" \n");
    }
  }
#endif

  if (argc>1)
    {
      printf("meshfile: %s \n", argv[1]);
    }
  else
    {
      printf("main: no mesh specified!\n");
      return FAIL;
    }
  err=mesh_read_file_t1( &msh1, &set, argv[1] ); /* */
  FUNCTION_FAILURE_HANDLE( err, mesh_read_file_t1, main);


  /* set the goals and methods */
  level0  =  set.refine_ini;
  lvlm    =  set.refine_steps;
  tau         = set.instat_delta_t0;
  write_steps = set.instat_write_steps;
  Tstrt       = set.instat_Time0;
  Tend        = set.instat_Tend;

  tau_write   = set.instat_write_delta;
  t_write     = Tstrt;

  dim=msh1.dim;

  // eps=msh1.para[0];

  for (i=0; i<level0 ; i++)
    {
      err=mesh_refine_uniform_t1( &msh1 );
      FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t1, main);
      
      tau         /= 2;
      write_steps *= 2;
    }

  /* reset hierarchy */
  msh1.hi_nr=0;
  for (j=0; j<msh1.eg_nr; j++)
    {
      msh1.edge[j*msh1.eg_w+MCT1EGLVL]=-1;
    }



  /* convert to coarse t2 mesh */
  err=mesh_t1_to_t2(&msh1, &msh2);
  FUNCTION_FAILURE_HANDLE( err, mesh_t1_to_t2, main);

  /* local refinement in a band of the domain */
  for (lvl=0; lvl<0*(3+1+1)*2; lvl++) 
    {
      FIDX *marker;
      FIDX nrmarked;
      // 2km x 2km:
      // double c_band[]={0.0, 1.0}, d_band=-1900.0, width_band=30.0;
      // case A:
      double c_band[]={0.0, 1.0}, d_band=-9900.0, width_band=30.0;

      TRY_MALLOC( marker, msh2.el_nr, FIDX, main);

      if (lvl>=2) width_band=20;
      if (lvl>=4) width_band=10;
      if (lvl>=8) width_band=5;

      err=gen_band_marker_tx(&msh2, marker, &nrmarked,
			     c_band, d_band, width_band, 2); 
      FUNCTION_FAILURE_HANDLE( err, gen_band_marker_tx, main);

      if (nrmarked>0)
	{
	  /* refine the mesh */
	  err=mesh_refine_adaptive_t2( &msh2, marker);
	  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2, main);
	}
      printf("refined %6"dFIDX" elements, vx_nr=%6"dFIDX"\n",
	     nrmarked,msh2.vx_nr);
	      
      
      free(marker);
    }

    


  /* refine */
  for (lvl=0; lvl<lvlm-1; lvl++)
    {
      err=mesh_refine_uniform_t2( &msh2 );
      FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2, main);

      tau         /= 2;
      write_steps *= 2;
    }
  
  /* local refinement in a band of the domain */
  for (lvl=0; lvl<(3+1+1)*2-2; lvl++) 
    {
      FIDX *marker;
      FIDX nrmarked;
      // 2km x 2km:
      // double c_band[]={0.0, 1.0}, d_band=-1900.0, width_band=30.0;
      // case A:
      double c_band[]={0.0, 1.0}, d_band=-9900.0, width_band=30.0;

      TRY_MALLOC( marker, msh2.el_nr, FIDX, main);

      if (lvl>=2) width_band=20;
      if (lvl>=4) width_band=10;
      if (lvl>=8) width_band=5;

      err=gen_band_marker_tx(&msh2, marker, &nrmarked,
			     c_band, d_band, width_band, 2); 
      FUNCTION_FAILURE_HANDLE( err, gen_band_marker_tx, main);

      if (nrmarked>0)
	{
	  /* refine the mesh */
	  err=mesh_refine_adaptive_t2( &msh2, marker);
	  FUNCTION_FAILURE_HANDLE( err, mesh_refine_uniform_t2, main);
	}
      printf("refined %6"dFIDX" elements, vx_nr=%6"dFIDX"\n",
	     nrmarked,msh2.vx_nr);
	      
      
      free(marker);
    }



  vx_nr=msh2.vx_nr; 

  TRY_MALLOC(buffer, strlen(argv[1])+10, char, main);



  /* allocate memory for matrix K and the vectors */
  err=sparse_flex_alloc( &A, vx_nr, FEINS_SPARSE_COLS_P_ROW_T2);
  err=sparse_flex_alloc( &M, vx_nr, FEINS_SPARSE_COLS_P_ROW_T2);
  err=sparse_flex_alloc( &K, vx_nr, FEINS_SPARSE_COLS_P_ROW_T2);
  err=projector1_alloc( &P, vx_nr );

  x=&xtrial1;
  xtrial=&xtrial2;
  err=vector_alloc( x, vx_nr );
  err=vector_alloc( xtrial, vx_nr );
  err=vector_alloc( &diff, vx_nr );
  err=vector_alloc( &xwrite, vx_nr );
  err=vector_alloc( &b, vx_nr );
  err=vector_alloc( &bdot, vx_nr );

  err=vector_alloc( &Un1, vx_nr );
  err=vector_alloc( &Un2, vx_nr );
  err=vector_alloc( &Un3, vx_nr );

  err=vector_alloc( &sumU, vx_nr );

  /* set vectors to zero */
  for (i=0; i<vx_nr; i++)
    {
      (*x).V[i]    =0.0;
    }


  /* assemble the system matrices for the time stepping  */
  {
    // need to put cB_M=1/tau for stabilisation parameter
    err=assem_conv_diff_tx( &M, &K,  &b, NULL, &maxPe, eps,
			    &A, NULL, 1.0, 1.0/(tau*gamma), 0.0, 1.0/tau,
			    &msh2, 1, Tstrt, 2);
    FUNCTION_FAILURE_HANDLE( err, assem_conv_diff_tx, main);
  }


  /* set the Dirichlet boundary conditions */
  err=assem_Dirichlet_BC_tx( &P, &msh2, 1, x, 2);
  FUNCTION_FAILURE_HANDLE( err, assem_Dirichlet_BC_tx, main);


  /* time loop */
  /* x=x(0)=0  is given above */
  not_done = 1;
  tau_old  = 0.0;
/* old time */
  time=Tstrt;
  time_old=time;

  /* to this point the time step size was reduced like the space step
     size, the error allowed in each time step is to be of the order
     of the spatial discretisation error to achieve optimal combined
     order, we use P2 elements, thus the L2 error should be h^3 */
  rtol=pow(tau,3.0)/(Tend-Tstrt); 

  /* atol=rtol; */
  printf("atol=%8.2e   rtol=%8.2e\n", atol, rtol);
  for (tstep=0; not_done; tstep++)
    {
      /* set all stage values to zero */
      for (i=0; i<vx_nr; i++)
	{
	  Un1.V[i]= 0.0;
	  Un2.V[i]= 0.0;
	  Un3.V[i]= 0.0;
	}

      /* for tstep==0 (time=Tstart) use the same output routine as
	 during time advance,
	 only start time advance at tstep==1 */
      if (tstep>0)
	{

	  /****************/
	  /*              */
	  /*   Stage 1    */
	  /*              */
	  /****************/

	  TIMEGET;
	  t0=ti;
	  
	  err=assem_conv_diff_tx( NULL, NULL,  &b, &bdot, &maxPe, eps,
				  NULL, NULL, 0.0, 0.0, 0.0, 1.0/tau,
				  &msh2, 1, time+alpha1*tau, 2); /* */
	  FUNCTION_FAILURE_HANDLE( err, assem_conv_diff_tx, main);

	  TIMEGET;
	  ta=ti-t0;
	  t0=ti;

	  /* b:= b- K*x(k-1),   to use it as RHS for the linear solves */
	  for (i=0; i<vx_nr; i++)
	    {
	      sumU.V[i] = - (*x).V[i];
	    }
	  err=sparse_mul_mat_vec_add( &K, &sumU, &b);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add, main);

	  /* b:= b+ tau*gamma1*bdot   */
	  factor1=tau*gamma1;
	  for (i=0; i<vx_nr; i++)
	    {
	      b.V[i] += factor1*bdot.V[i];
	    }

	  TIMEGET;
	  tb=ti-t0;

	  /* now solve A*x=b */
	  /* for now only direct solver UMFPACK */
	  {
	    int task;
	    
	    TIMEGET;
	    t0=ti;

	    task=1;
	    /* due to adaptive time stepping, tau and therefore matrix
	       A change in every tstep 
	       if (tstep<=1)
	       {
	       task=1;
	       }
	       else
	       {
	       task=2;
	       } /* */

	    err=sparse_solve_UMFPACK_reuse( &Un1, &A, &b, &P, &umfpackdata,
					    &umfpack_isdiri, &umfpack_help, task );
	    FUNCTION_FAILURE_HANDLE( err, sparse_solve_UMFPACK_reuse, main); /* */
	    
	    TIMEGET;
	    ts=ti-t0;
	  }


	  /****************/
	  /*              */
	  /*   Stage 2    */
	  /*              */
	  /****************/

	  TIMEGET;
	  t0=ti;
	  
	  err=assem_conv_diff_tx( NULL, NULL,  &b, &bdot, &maxPe, eps,
				  NULL, NULL, 0.0, 0.0, 0.0, 1.0/tau,
				  &msh2, 1, time+alpha2*tau, 2); /* */
	  FUNCTION_FAILURE_HANDLE( err, assem_conv_diff_tx, main);

	  TIMEGET;
	  ta+=ti-t0;
	  t0=ti;

	  /* b:= b- K*( x(k-1)+a21*Un1 ),   to use it as RHS for the linear solves */
	  for (i=0; i<vx_nr; i++)
	    {
	      sumU.V[i] = - (*x).V[i] - a21*Un1.V[i];
	    }
	  err=sparse_mul_mat_vec_add( &K, &sumU, &b);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add, main);
	  
	  /* b:= b + M*( c21/tau*Un1 ),   to use it as RHS for the linear solves */
	  factor1=c21/tau;
	  for (i=0; i<vx_nr; i++)
	    {
	      sumU.V[i] = factor1*Un1.V[i];
	    }
	  err=sparse_mul_mat_vec_add( &M, &sumU, &b);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add, main);

	  /* b:= b+ tau*gamma2*bdot   */
	  factor1=tau*gamma2;
	  for (i=0; i<vx_nr; i++)
	    {
	      b.V[i] += factor1*bdot.V[i];
	    }

	  TIMEGET;
	  tb+=ti-t0;
	  t0=ti;

	  /* now solve A*x=b */
	  /* for now only direct solver UMFPACK */
	  err=sparse_solve_UMFPACK_reuse( &Un2, &A, &b, &P, &umfpackdata,
					    &umfpack_isdiri, &umfpack_help, 2 );
	  FUNCTION_FAILURE_HANDLE( err, sparse_solve_UMFPACK_reuse, main); /* */
	  
	  TIMEGET;
	  ts+=ti-t0;



	  /****************/
	  /*              */
	  /*   Stage 3    */
	  /*              */
	  /****************/

	  TIMEGET;
	  t0=ti;
	  
	  err=assem_conv_diff_tx( NULL, NULL,  &b, &bdot, &maxPe, eps,
				  NULL, NULL, 0.0, 0.0, 0.0, 1.0/tau,
				  &msh2, 1, time+alpha3*tau, 2); /* */
	  FUNCTION_FAILURE_HANDLE( err, assem_conv_diff_tx, main);

	  TIMEGET;
	  ta+=ti-t0;
	  t0=ti;

	  /* b:= b- K*( x(k-1)+a31*Un1+a32*Un2 ) !a32=0.0! => leave out>*/
	  for (i=0; i<vx_nr; i++)
	    {
	      sumU.V[i] = - (*x).V[i] - a31*Un1.V[i];
	    }
	  err=sparse_mul_mat_vec_add( &K, &sumU, &b);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add, main);
	  
	  /* b:= b + M*( c21/tau*Un1 ),   to use it as RHS for the linear solves */
	  factor1=c31/tau;
	  factor2=c32/tau;
	  for (i=0; i<vx_nr; i++)
	    {
	      sumU.V[i] = factor1*Un1.V[i] + factor2*Un2.V[i];
	    }
	  err=sparse_mul_mat_vec_add( &M, &sumU, &b);
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec_add, main);

	  /* b:= b+ tau*gamma3*bdot   */
	  factor1=tau*gamma3;
	  for (i=0; i<vx_nr; i++)
	    {
	      b.V[i] += factor1*bdot.V[i];
	    }


	  TIMEGET;
	  tb+=ti-t0;
	  t0=ti;

	  /* now solve A*x=b */
	  /* for now only direct solver UMFPACK */
	  err=sparse_solve_UMFPACK_reuse( &Un3, &A, &b, &P, &umfpackdata,
					    &umfpack_isdiri, &umfpack_help, 2 );
	  FUNCTION_FAILURE_HANDLE( err, sparse_solve_UMFPACK_reuse, main); /* */
	  
	  /* clean up umfpack data */
	  err=sparse_solve_UMFPACK_reuse( &Un3, &A, &b, &P, &umfpackdata,
					    &umfpack_isdiri, &umfpack_help, 9 );
	  FUNCTION_FAILURE_HANDLE( err, sparse_solve_UMFPACK_reuse, main); /* */

	  TIMEGET;
	  ts+=ti-t0;

	  /* update x, and compute diff=x-xhat */ 
	  for (i=0; i<vx_nr; i++)
	    {
	      double diffi, xnew;
	      
	      diffi   = m1*Un1.V[i] + m2*Un2.V[i] + m3*Un3.V[i];


	      xnew   = (*x).V[i] + diffi;

	      (*xtrial).V[i] = xnew;

	      diffi  -= mhat1*Un1.V[i] + mhat2*Un2.V[i] + mhat3*Un3.V[i];
	      diff.V[i] = diffi;
	    }
	  /* compute norm_M(x) and norm_M(x-xhat) */
	  norm=0.0;
	  err=sparse_vec_mat_vec(&M, xtrial, xtrial, &norm);
	  FUNCTION_FAILURE_HANDLE( err, sparse_vec_mat_vec, main);

	  err_norm_new=0.0;
	  err=sparse_vec_mat_vec(&M, &diff, &diff, &err_norm_new);
	  FUNCTION_FAILURE_HANDLE( err, sparse_vec_mat_vec, main);


	  printf("tstep=%5"dFIDX"  time=%8.6f  tau=%8.2e  |x|=%8.2e |x-xhat|/|x|=%8.2e   solver UMFPACK, size=%6"dFIDX", t_assem=%8.2e, t_mul=%8.2e, t_solver=%8.2e ", tstep, time, tau, sqrt(norm), sqrt(err_norm_new)/sqrt(norm), (*x).len, ta, tb, ti-t0);

	  tol=fmax(rtol*sqrt(norm),atol);

	  /* no tau_old, err_norm_old -> use simple formula (IV.7) */
	  factor1=rho*pow(tol/sqrt(err_norm_new), 1.0/3.0);

	  /* step size control from Jens Lang, Adaptive Multilevel
	     Solution of Nonlinear Parabolic PDE Systems, Springer
	     2001, formula (IV.10) */
	  if (tau_old!=0.0)
	    {
	      /* use (IV.10) */
	      factor2=rho*tau/tau_old*pow(tol*err_norm_old/err_norm_new, 1.0/3.0);

	      if (factor2<factor1) { factor1=factor2; }
	    }

	  /* safeguard against unrealistic quick changes in tau */
	  if (factor1<alpha_low) factor1=alpha_low;
	  else if (factor1>alpha_high) factor1=alpha_high;
	  
	  tau_new=factor1*tau;

	  if (err_norm_new<tol*tol)
	    {
	      /* accept step */
	      struct vector *tmp;

	      tmp=x;
	      x=xtrial;
	      xtrial=tmp;

	      time_old=time;
	      time += tau;

	      tau_old=tau;
	      err_norm_old=sqrt(err_norm_new);

	      printf(" accepted\n");

	      if (time>Tend)
		{
		  not_done=0;
		}
	    } /* otherwise it is rejected */
	  else
	    {
	      printf(" rejected\n");
	    }


	  /* try/use the new tau */
	  tau=tau_new;


	} /* end not first tstep */


      while (time>=t_write)
	{
	  double theta, mt1, mt2, mt3; 
	  char fname[200];

	  
	  /*sprintf(fname, "visual/conv_diff_vtk_%05"dFIDX"_%12.6e",
	    tstep,t_write);*/

	  /* sprintf(fname, "visual/conv_diff_vtk_%07ld%07"dFIDX,
	     lround((t_write-Tstrt)/tau_write),tstep); */
	  sprintf(fname, "visual/conv_diff_vtk_%07ld",
		  lround((t_write-Tstrt)/tau_write));

	  /* interpolate the solution to intermediate time t_write */
	  if (time-time_old>0)
	    {
	      theta=(t_write-time_old)/(time-time_old);
	    }
	  else
	    {
	      theta=0;
	    }

	  mt1=theta*(mtheta1_1+mtheta1_2*theta);
	  mt2=theta*(mtheta2_1+mtheta2_2*theta);
	  mt3=theta*(mtheta3_1+mtheta3_2*theta);

	  for (i=0; i<vx_nr; i++)
	    {
	      /* xtrial was accepted, so it became x, at the same time
		 xtrial was replaced by x(old) */
	      
	      xwrite.V[i] = (*xtrial).V[i] 
		+ mt1*Un1.V[i] + mt2*Un2.V[i] + mt3*Un3.V[i];
	    }
	  

	  /* write the solution for paraview visualisation */
	  err=mesh_write_solution_vtk_tx( &msh2, NULL, 0, &xwrite, 1,
					  strlen(fname)+1,fname );
	  FUNCTION_FAILURE_HANDLE( err, mesh_write_solution_vtk_tx,
				   main); /* */

	  t_write += tau_write;
	}


      if ((tau<=0.0)||(isnan(tau)))
	{
	  fprintf(stderr,"main: invalid tau, stop time integration \n");
	  not_done=0;
	}

    } /* end time loop */

  /* clean up */

  sparse_free(&A);
  sparse_free(&M);
  sparse_free(&K);

  projector1_free(&P);

  vector_free(&sumU);

  vector_free(&Un3);
  vector_free(&Un2);
  vector_free(&Un1);

  vector_free(&bdot);
  vector_free(&b);
  vector_free(&xwrite);
  vector_free(&diff);
  vector_free(xtrial);
  vector_free(x);


  mesh_free(&msh2);
  mesh_free(&msh1);

  free(buffer);

  return SUCCESS;


}
