/***********************************************************************

    This file is part of
    FEINS, Finite Element Incompressible Navier-Stokes solver,
    which is expanding to a more general FEM solver and toolbox,
    Copyright (C) 2003--2013, Rene Schneider 
    <rene.schneider@mathematik.tu-chemnitz.de>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    Minor contributions to this program (for example bug-fixes and
    minor extensions) by third parties automatically transfer the
    copyright to the general author of FEINS, to maintain the
    possibility of commercial re-licensing. If you contribute but wish
    to keep the copyright of your contribution, make that clear in
    your contribution!

    Non-GPL licenses to this program are available upon request from
    the author.

************************************************************************/
/*
FILE sparse.c
HEADER sparse.h

TO_HEADER:

// config (by cmake)
#include "config.h"

// useful text macros
#include "feins_macros.h"

// data structures 
#include "sparse_struct.h"
#include "sparse_inline_functions.h"

*/


/* prototypes of external functions */
#include <math.h>
#include "feins_lapack.h"
#include "linsolve_umfpack.h"
#include "linsolve_hypre_AMG.h"
#include "mesh.h"  /* for comp_intdouble_d */

// data structures imported
#include "datastruc.h"

/* function prototypes */
#include "sparse.h"

/*FUNCTION*/
int sparse_flex_alloc(struct sparse *K, FIDX nrows, FIDX cols_p_row
/* creates a sparse matrix in the initial flexible format with nrows rows 
   
   Input:  nrows   - number of rows of the matrix,
           cols_p_row
                   - number of columns per row, this should be at
                     least the average number of columns per row,
                     if it is too small, it will cause large CPU
                     overhead during assembly of the matrix, if to
                     large, it causes memory waste
		     the average number of columns per row under 
		     uniform refinement:
		     t1 -> 7
		     t2 -> 12 
		     e1 -> 15  -> vector valued  -> 3*15 ~ 45
		     e2 -> 29  -> vector valued  -> 3*29 ~ 87 

   Output: K       - (K is given by reference), memory for the
                     neccessary fields in K is allocated,
		     dimensions of K initialised, type set to flexible

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i;

  TRY_MALLOC( (*K).flex_row_col_nr, nrows, FIDX, sparse_flex_alloc);
  TRY_MALLOC( (*K).flex_row_cols, nrows, struct sparse_flex_col_block,
	      sparse_flex_alloc);

#ifdef HAVE_OPENMP
#pragma omp parallel
  {
#pragma omp master
    {
      (*K).nr_processors = omp_get_num_threads();
    }
  }
#else
  (*K).nr_processors = 1;
#endif

  TRY_MALLOC( (*K).flex_reserve, (*K).nr_processors, 
	      struct sparse_flex_reserve_list, sparse_flex_alloc);

  (*K).flex_row_col_max = cols_p_row;

  /* initialise each row */
  for (i=0; i<nrows; i++)
    {
      TRY_MALLOC( (*K).flex_row_cols[i].cols_idx,
		  (*K).flex_row_col_max, FIDX, sparse_flex_alloc);
      TRY_MALLOC( (*K).flex_row_cols[i].cols_data,
		  (*K).flex_row_col_max, double, sparse_flex_alloc);
      (*K).flex_row_cols[i].cols_idx[0]  = i;
      (*K).flex_row_cols[i].cols_data[0] = 0.0;
      (*K).flex_row_col_nr[i]  = 1;
      (*K).flex_row_cols[i].next = NULL;
    }
  /* initialise the reserve cols lists */
  { 
    FIDX nr_reserve_blocks=1000;

    for (i=0; i<(*K).nr_processors; i++)
      {
	FIDX j;
	TRY_MALLOC( (*K).flex_reserve[i].blocks,
		    nr_reserve_blocks, struct sparse_flex_col_block,
		    sparse_flex_alloc);
	
	for (j=0; j<nr_reserve_blocks; j++)
	  {
	    struct sparse_flex_col_block *block;
	    block=&(*K).flex_reserve[i].blocks[j];
	    TRY_MALLOC( (*block).cols_idx,
			(*K).flex_row_col_max, FIDX, sparse_flex_alloc);
	    TRY_MALLOC( (*block).cols_data,
			(*K).flex_row_col_max, double, sparse_flex_alloc);
	    (*block).next = NULL;
	  }
	(*K).flex_reserve[i].nr=0;
	(*K).flex_reserve[i].max=nr_reserve_blocks;
	(*K).flex_reserve[i].next=NULL;
      }
  } /* end init reserve blocks */


  (*K).row_nr   = nrows;
  (*K).type     = SP_TYPE_FLEX;

  /* data not used at the moment */
  (*K).rows     = NULL;
  (*K).cols     = NULL;
  (*K).A        = NULL;  

  (*K).DD_proc_nr   = 0;
  (*K).DD_waves_nr  = 0;
  (*K).DD_nr        = 0;
  (*K).DD_rows_idx  = NULL;
  (*K).DD_rows      = NULL;

  return SUCCESS;
}


/*FUNCTION*/
int sparse_set_DD(struct sparse *K, FIDX DD_proc_nr, FIDX DD_waves_nr,
		  FIDX *DD_node_idx, FIDX *DD_node, FIDX comps
/* set the DD info of the sparse struct, for OMP parallelisation 
   
   Input:  nrows   - number of rows of the matrix,
           density - not used!, only present for backward compatibility
	   DD_proc_nr
	           - number of processors for which the DD was created
	   DD_waves_nr
	           - number of waves of DD_proc_nr independent domains
	   DD_node_idx
                   - DD_node_idx[i] index of start of subdomain i
		     in DD_node (length DD_nr+1)
	   DD_node - list of nodes in each subdomain
	   comps   - number of block rows of the matrix for vector
	             valued PDE problems, comps=1 for scalar PDES

   In/Out: K       - (K is given by reference), the DD_* parts are set

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i, j, cmp, DD_nr, vx_nr;
  
  DD_nr = DD_proc_nr*DD_waves_nr;
  vx_nr = DD_node_idx[DD_nr];

  /* check number of nodes times comps matches rows in the matrix */
  if (vx_nr*comps != (*K).row_nr)
    {
      fprintf(stderr,"sparse_set_DD: "
	      "number of nodes*comp does not match K.row_nr\n"
	      "          %"dFIDX"*%"dFIDX"=%d, K.row_nr=%"dFIDX"\n",
	      vx_nr,comps,vx_nr*comps,(*K).row_nr);
      return FAIL;
    }

  TRY_MALLOC( (*K).DD_rows_idx, DD_nr+1, FIDX, sparse_set_DD);
  TRY_MALLOC( (*K).DD_rows, (*K).row_nr, FIDX, sparse_set_DD);

  (*K).DD_rows_idx[0]=0;
  for (i=0; i<DD_nr; i++)
    {
      FIDX this_nr=DD_node_idx[i+1]-DD_node_idx[i];
      FIDX this_rows_start=(*K).DD_rows_idx[i];
      FIDX this_nodes_start=DD_node_idx[i];

      (*K).DD_rows_idx[i+1]=this_rows_start+comps*this_nr;

      for (cmp=0; cmp<comps; cmp++)
	for (j=0; j<this_nr; j++)
	  {
	    (*K).DD_rows[this_rows_start+cmp*this_nr+j]
	      = cmp*vx_nr+DD_node[this_nodes_start+j];
	  }
    }

  (*K).DD_proc_nr  = DD_proc_nr;
  (*K).DD_waves_nr = DD_waves_nr;
  (*K).DD_nr       = DD_nr;
  
  return SUCCESS;
}




/*FUNCTION*/
void sparse_init(struct sparse *K
/* initialises K to be empty, such that it can be emptied without
   problems 
   
   Input:  nrows   - number of rows of the matrix,
           density - maximal number of nonzeros per row,

   Output: K       - (K is given by reference), memory for the
                     neccessary fields in K is allocated,
		     dimensions of K initialised

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  (*K).type      = SP_TYPE_UNDEF;
  (*K).row_nr    = 0;

  (*K).rows      = NULL;
  (*K).cols      = NULL;
  (*K).A         = NULL;  

  (*K).flex_row_col_nr  = NULL;
  (*K).flex_row_col_max = 0;
  (*K).flex_row_cols    = NULL;
  (*K).nr_processors    = 0;
  (*K).flex_reserve     = NULL;

  (*K).DD_proc_nr   = 0;
  (*K).DD_waves_nr  = 0;
  (*K).DD_nr        = 0;
  (*K).DD_rows_idx  = NULL;
  (*K).DD_rows      = NULL;

  return;
}

/*FUNCTION*/
void sparse_free(struct sparse *K
/* memory allocated in K is freed, so that K can be freed

   Output: K       - (K is given by reference) inside memory is
                     released 
*/
		  ){
  FIDX i;

  switch ( (*K).type )
    {
    case SP_TYPE_FLEX:
      for (i=0; i<(*K).row_nr; i++)
	{
	  free( (*K).flex_row_cols[i].cols_idx );
	  free( (*K).flex_row_cols[i].cols_data );
	}
      for (i=0; i<(*K).nr_processors; i++)
	{
	  FIDX j;
	  struct sparse_flex_reserve_list *this, *head, *next;
	  head=&(*K).flex_reserve[i];
	  this=head;
	  while (this!=NULL)
	    {
	      FIDX nr_reserve_blocks = (*this).max;

	      for (j=0; j<nr_reserve_blocks; j++)
		{
		  struct sparse_flex_col_block *block;
		  block=&(*this).blocks[j];
		  free( (*block).cols_idx );
		  free( (*block).cols_data );
		}
	      free( (*this).blocks );

	      next=(*this).next;
	      if (this!=head)
		{
		  free(this);
		}
	      this=next;
	    }
	}
      free((*K).flex_row_cols);
      free((*K).flex_row_col_nr);
      (*K).flex_row_col_max=0;
      (*K).flex_row_col_nr  = NULL;
      (*K).flex_row_cols    = NULL;
      (*K).nr_processors    = 0;
      free( (*K).flex_reserve );
      (*K).flex_reserve     = NULL;
      break;
    case SP_TYPE_COMPROW:
      free((*K).rows);
      (*K).rows=NULL;
      free((*K).cols);
      (*K).cols=NULL;
      free((*K).A);
      (*K).A=NULL;
    }

  (*K).type = SP_TYPE_UNDEF;
  (*K).row_nr   = 0;

  (*K).DD_proc_nr   = 0;
  (*K).DD_waves_nr  = 0;
  (*K).DD_nr        = 0;
  if ( (*K).DD_rows_idx != NULL) free(  (*K).DD_rows_idx );
  (*K).DD_rows_idx  = NULL;
  if ( (*K).DD_rows != NULL) free(  (*K).DD_rows );
  (*K).DD_rows      = NULL;

}


/*FUNCTION*/
void sparse_empty(struct sparse *K
/* the sparse matrix K is set to zero, 
   if the matrix is in compressed row storage, the sparsity structure
   is preserved
   if the matrix is in flexible format, it will be reset to a diagonal
   matrix of zeros

   Output: K       - (K is given by reference) is set to zero matrix
*/
		  ){
  FIDX i,m;

  /* DD_rows and other DD_* stuff is to be left as is */

  switch( (*K).type )
    {
    case SP_TYPE_FLEX:
      /* zero each row */
      for (i=0; i<(*K).row_nr; i++)
	{
	  (*K).flex_row_cols[i].cols_idx[0]  = i;
	  (*K).flex_row_cols[i].cols_data[0] = 0.0;
	  (*K).flex_row_col_nr[i]  = 1;
	  /* the next column block stays connected, if it was assigned */
	}
      /* the reserve column blocks stay as they are */
      break;
    case SP_TYPE_COMPROW:
      m=(*K).rows[(*K).row_nr];
      for (i=0; i<m; i++)
	{
	  (*K).A[i]=0.0;
	}
      break;
    default:
      fprintf(stderr,"sparse_empty: type %d not implemented\n",
	      (*K).type );
    }
  
  return;
}

/*FUNCTION*/
int sparse_row_empty(struct sparse *K, FIDX row 
/* the row-th row of the sparse matrix K is set to zero

   Input:  row     - number of the row to be emptied

   Output: K       - (K is given by reference) is set to zero matrix

   Return: SUCCESS - success
           FAIL    - failure, the row doesn't belong to the matrix
*/
		  ){
  FIDX i,m;

  if ((row<0)||(row>=(*K).row_nr))
    {
      fprintf(stderr, "sparse_row_empty: row %"dFIDX" invalid !\n",  row);
      return FAIL;
    }

  /* DD_rows and other DD_* stuff is to be left as is */

  switch( (*K).type)
    {
    case SP_TYPE_FLEX:
      (*K).flex_row_cols[row].cols_idx[0]  = row;
      (*K).flex_row_cols[row].cols_data[0] = 0.0;
      (*K).flex_row_col_nr[row]  = 1;
      /* the next column block stays connected, if it was assigned */
      break;
    case SP_TYPE_COMPROW:
      m = (*K).rows[row+1];
      for (i=(*K).rows[row]; i<m; i++)
	{
	  (*K).A[i]=0.0;
	}
      break;
    default:
      fprintf(stderr, "sparse_row_empty: invalid type!\n");
      return FAIL;
    }
  return SUCCESS;
}


/*FUNCTION*/
int sparse_mat_write_file(struct sparse *K, char *name
/* writes the contents of the sparse matrix to a file with the
   specified name
   
   Input:  K       - sparse matrix, original
           name    - name for the file

   Output: (the file)

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  FIDX i, j, m, row_nr;

  FILE *out;

  row_nr=(*K).row_nr;


 /* open the file */
  out=fopen(name, "w");
  if (out==NULL)
    {
      fprintf(stderr,
	      "sparse_mat_write_file: error opening file \"%s\"\n",
	      name);
      return FAIL;
    }
  switch( (*K).type )
    {
    case SP_TYPE_FLEX: 
      /* loop over all rows */
      for (i=0; i<row_nr; i++)
	{
	  struct sparse_flex_col_block *block;
	  FIDX col_max=(*K).flex_row_col_max;

	  /* loop over the collumns of this row */
	  block=&(*K).flex_row_cols[i];
	  for (j=0; j<(*K).flex_row_col_nr[i]; j++)
	    {
	      fprintf(out, "  %7"dFIDX"  %7"dFIDX"   %+24.16e\n",
		      (i+1),  (*block).cols_idx[j%col_max]+1,
		      (*block).cols_data[j%col_max]); /* matlab counts indices
						     from 1, we from 0 */
	      /* at end of block */
	      if ( (j%col_max)==col_max-1 )
		{
		  /* switch to next block */
		  block=(*block).next;
		}
	    }
	  /* end row */
	}
      break;
    case SP_TYPE_COMPROW: 
      /* loop over all rows */
      for (i=0; i<row_nr; i++)
	{
	  m=(*K).rows[i+1];
	  for (j=(*K).rows[i]; j<m; j++)
	    {
	      /* fprintf(out, "%7"dFIDX" %7"dFIDX" %+16.8e\n", /* for smaller files*/
	      fprintf(out, "  %7"dFIDX"  %7"dFIDX"   %+24.16e\n",
		       (i+1),  ( (*K).cols[j] + 1 ),
		      (*K).A[j]); /* matlab counts indices
				     from 1, we from 0 */
	    }
	  /* end row */
	}
      break;
    default:
      fprintf(stderr,
	      "sparse_mat_write_file: invalid type!\n");
      return FAIL;
    }

  /* close the file */
  fclose(out);

  return SUCCESS;
}

/*FUNCTION*/
int sparse_mat_read_file(struct sparse *A, char *name
/* writes the contents of the sparse matrix to a file with the
   specified name
   
   Input:  name    - name for the file

   Output: A       - sparse matrix in flexible storage format,
                     is initialised and storage allocated here
           
   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  FIDX   i, j, row_nr;
  FIDX   int_i, int_j; 
  int    happy, err;
  double value;

  FILE *in;

 /* open the file */
  in=fopen(name, "r");
  if (in==NULL)
    {
      fprintf(stderr,
	      "sparse_mat_read_file: error opening file \"%s\"\n",
	      name);
      return FAIL;
    }

  row_nr=0;
  happy=1;
  while (happy!=0)
    {
      int nr;
      nr = fscanf(in, "%"dFIDX"  %"dFIDX"   %le\n", &int_i, &int_j, &value);
      if (nr==3)
	{
	  if (int_i>row_nr) row_nr=int_i;
	}
      else
	{
	  happy=0;
	}
    }

  /* back to begin of file */
  rewind(in);

  err=sparse_flex_alloc(A, row_nr, 30);
  FUNCTION_FAILURE_HANDLE( err, sparse_flex_alloc, sparse_mat_read_file);
  
  /* actually read the file */
  happy=1;
  while (happy!=0)
    {
      int nr;
      double *entry;
      nr = fscanf(in, "%"dFIDX"  %"dFIDX"   %le\n", &int_i, &int_j, &value);
      if (nr==3)
	{
	  /* matlab counts indices from 1, we from 0 */
	  i=(FIDX) int_i-1;
	  j=(FIDX) int_j-1;
	  err=sparse_get_entry( A, i, j, 1, &entry);
	  if (err!=SUCCESS)
	    {
	      fprintf(stderr, "sparse_mat_read_file: "
		      "coulnd get/create entry in K for "
		      "row=%"dFIDX" col=%"dFIDX" ",  i,  j);
	      fclose(in);
	      return FAIL;
	    }
	  (*entry)+=value;
	}
      else
	{
	  happy=0;
	}
    }

  /* close the file */
  fclose(in);

  return SUCCESS;
}


/*FUNCTION*/
int sparse_mat_transp(struct sparse *A, struct sparse *B
/* copies a the transpose of the sparse matrix A into B,
   B = A^T;

   B is always created in flexible format

   Input:  A       - sparse matrix, original, may be flexible or
                     compressed row format
		     must be N-by-N matrix

   Output: B       - sparse matrix, transpose, internal sizes are set
                     and memory allocated,

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  int  err;
  FIDX i, j, endrow, row_nr, cols_p_row;
  double *entry;

  row_nr=(*A).row_nr;

  switch( (*A).type )
    {
    case SP_TYPE_FLEX:
      cols_p_row= (*A).flex_row_col_max;
      break;
    case SP_TYPE_COMPROW:
      cols_p_row= (((*A).rows[row_nr]/row_nr+1)*3)/2+1;
      break;
    default:
      fprintf(stderr,
	      "sparse_mat_transp: invalid type (1)!\n");
      return FAIL;
    }

  

  err=sparse_flex_alloc( B, row_nr, cols_p_row);
  FUNCTION_FAILURE_HANDLE( err, sparse_flex_alloc, sparse_mat_transp);

  switch( (*A).type )
    {
    case SP_TYPE_FLEX:
      /* loop over all rows */
      for (i=0; i<row_nr; i++)
	{
	  struct sparse_flex_col_block *block;
	  FIDX col_max=(*A).flex_row_col_max;

	  /* loop over the collumns of this row */
	  block=&(*A).flex_row_cols[i];
	  for (j=0; j<(*A).flex_row_col_nr[i]; j++)
	    {
	      err=sparse_get_entry( B, (*block).cols_idx[j%col_max], i, 
				    1, &entry);
	      FUNCTION_FAILURE_HANDLE( err, sparse_get_entry,
				       sparse_mat_transp);
  
	      /* B(col,i) = A(i,col) */
	      (*entry) = (*block).cols_data[j%col_max];

	      /* at end of block */
	      if ( (j%col_max)==col_max-1 )
		{
		  /* switch to next block */
		  block=(*block).next;
		}
	    }
	} /* end row */
      break;
    case SP_TYPE_COMPROW:
      /* loop over all rows */
      for (i=0; i<row_nr; i++)
	{
	  endrow=(*A).rows[i+1];
	  for (j=(*A).rows[i]; j<endrow; j++)
	    {
	      err=sparse_get_entry( B, (*A).cols[j], i, 1, &entry);
	      FUNCTION_FAILURE_HANDLE( err, sparse_get_entry,
				       sparse_mat_transp);

	      /* B(col,i) = A(i,col) */
	      (*entry) = (*A).A[j];
	      /* next collumn */
	    }
	  /* end row */
	}
      break;
    default:
      fprintf(stderr,
	      "sparse_mat_transp: invalid type (2)!\n");
      return FAIL;
    }

  /* copy the DD_data */
  (*B).DD_proc_nr  = (*A).DD_proc_nr;
  (*B).DD_waves_nr = (*A).DD_waves_nr;
  (*B).DD_nr       = (*A).DD_nr;
  TRY_MALLOC( (*B).DD_rows_idx, (*B).DD_nr+1, FIDX, sparse_mat_transp);
  TRY_MALLOC( (*B).DD_rows, row_nr, FIDX, sparse_mat_transp);
  memcpy( (*B).DD_rows_idx, (*A).DD_rows_idx, ((*B).DD_nr+1)*sizeof(FIDX) );
  memcpy( (*B).DD_rows, (*A).DD_rows, row_nr*sizeof(FIDX) );
  

  return SUCCESS;
}


/*FUNCTION*/
int sparse_mul_mat_vec(void *arg1, struct vector *vec,
		       struct vector *out
/* multiplies the sparse matrix  K from left to the vector vec,
   
   out = K * vec;

   Input:  arg1=
           K       - sparse matrix
           vec     - vector

   Output: out     - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX col_max, row_max;

  struct sparse *K;
  K=arg1;

  row_max = (*K).row_nr;
  col_max = (*vec).len;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      int err;
     /* fprintf( stderr, "WARNING: sparse_mul_mat_vec, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n"); */
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_mul_mat_vec );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_mul_mat_vec: "
	       "matrix has incompatible type \n");
      return FAIL;
    }

  if (row_max != (*out).len)
    {
      fprintf(stderr, "sparse_mul_mat_vec: "
	      "supplied vector out has wrong size\n");
      fprintf(stderr, "row_max=%"dFIDX", out.len=%"dFIDX"\n",
	       row_max,  (*out).len);
      return FAIL;
    }

  if (((*K).DD_rows_idx==NULL)||((*K).DD_rows==NULL))
    {
      fprintf(stderr,
	      "sparse_mul_mat_vec: K.DD_*not set\n");
      return FAIL;
    }

#pragma omp parallel
  {
  FIDX i, j, endrow, col;
  
  FIDX irow;
  int my_id, OMP_num_threads, DD_num_threads, wave;

  /* simpliefied acces to A, input vector iv and output ov */
  double *A, *iv, *ov;
  double axrow;
  FIDX   *Arows, *Acols;



  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  iv      = (*vec).V;
  ov      = (*out).V;


#ifdef HAVE_OPENMP
  my_id      = omp_get_thread_num();
  OMP_num_threads = omp_get_num_threads();
#else
  my_id      = 0;
  OMP_num_threads = 1;
#endif
  if (OMP_num_threads<(*K).DD_proc_nr)
    {
      fprintf(stderr,
	      "sparse_mul_mat_vec: wrong number of treads\n"
	      "   is %d < K.DD_proc_nr=%d\n",
	      OMP_num_threads, (*K).DD_proc_nr);
      FEINS_FAILURE_ACTION
    }
  DD_num_threads=(*K).DD_proc_nr;

  /* loop over all rows */
  for (wave=0; wave<(*K).DD_waves_nr; wave++)
    {
      if (my_id<DD_num_threads)
	{
	  for (irow=(*K).DD_rows_idx[wave*DD_num_threads+my_id]; 
	       irow<(*K).DD_rows_idx[wave*DD_num_threads+my_id+1];
	       irow++)
    {
      i=(*K).DD_rows[irow];

      axrow=0.0;
      /* loop over the collumns of this row */
      endrow = Arows[i+1];
      for (j=Arows[i]; j<endrow; j++)
	{
	  col=Acols[j];


#ifdef DEBUGFEINS
	  if (col>=col_max)
	    {
	      fprintf(stderr,
		      "sparse_mul_mat_vec: invalid col>=col_max\n");
	      FEINS_FAILURE_ACTION;
	    }
#endif

	  axrow += A[j] * iv[col];
	  /* next collumn */
	}
      ov[i]=axrow;
    } /* next row */
	} /* end  if (my_id<DD_num_threads) */
# pragma omp barrier
    } /* end for wave */
  } /* end omp parallel */

  return SUCCESS;
}



/*FUNCTION*/
int sparse_GS_sweep_fwd(struct sparse *K, struct vector *b,
			struct vector *invdiag, FIDX sweeps,
			struct vector *x
/* applies sweeps Gauss-Seidel (GS) sweeps to improve the x as
   solution of 

    K * x = b;

   Input:  K       - sparse matrix
           b       - right hand side vector
           invdiag - inverse of the diagonal elements (1/a_ii)
           sweeps  - number of GS sweeps to be performed during this call

   In/Out: x       - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  if ( (*K).type == SP_TYPE_FLEX )
    {
      int err;
      /* fprintf( stderr, "WARNING: sparse_GS_sweep_fwd, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n"); */
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_GS_sweep_fwd );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_GS_sweep_fwd: "
	       "matrix has incompatible type \n");
      return FAIL;
    }


  if ((((*K).row_nr != (*x).len)||(((*K).row_nr != (*b).len)))
      ||((*K).row_nr != (*invdiag).len))
    {
      fprintf(stderr, "sparse_GS_sweep_fwd: "
	      "supplied vector has wrong size\n");
      fprintf(stderr, "K.(*K).row_nr=%"dFIDX", b.len=%"dFIDX", invdiag.len=%"dFIDX", x.len=%"dFIDX"\n",
	       (*K).row_nr,  (*b).len, 
	       (*invdiag).len,  (*x).len);
      return FAIL;
    }

  if (((*K).DD_rows_idx==NULL)||((*K).DD_rows==NULL))
    {
      fprintf(stderr,
	      "sparse_GS_sweep_fwd: K.DD_*not set\n");
      return FAIL;
    }

#pragma omp parallel
  {
  FIDX i, j, endrow, sweep, col, col_max, row_max;
  FIDX irow;
  int my_id, OMP_num_threads, DD_num_threads, wave;

  /* simpliefied acces to A, b, x and invdiag */
  double *A, *bv, *xv, *invdiagV;
  double axrow;
  FIDX   *Arows, *Acols;



  row_max = (*K).row_nr;
  col_max = (*x).len;

  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  bv      = (*b).V;
  xv      = (*x).V;
  invdiagV= (*invdiag).V;

#ifdef HAVE_OPENMP
  my_id      = omp_get_thread_num();
  OMP_num_threads = omp_get_num_threads();
#else
  my_id      = 0;
  OMP_num_threads = 1;
#endif
  if (OMP_num_threads<(*K).DD_proc_nr)
    {
      fprintf(stderr,
	      "sparse_GS_sweep_fwd: wrong number of treads\n"
	      "   is %d < K.DD_proc_nr=%d\n",
	      OMP_num_threads, (*K).DD_proc_nr);
      FEINS_FAILURE_ACTION
    }
  DD_num_threads=(*K).DD_proc_nr;

  for (sweep=0; sweep<sweeps; sweep++)
    {
      /* loop over all rows */
    for (wave=0; wave<(*K).DD_waves_nr; wave++)
      {
        if (my_id<DD_num_threads)
	  {
             for (irow=(*K).DD_rows_idx[wave*DD_num_threads+my_id]; 
                  irow<(*K).DD_rows_idx[wave*DD_num_threads+my_id+1];
                  irow++)
        {
          i=(*K).DD_rows[irow];
	  
	  axrow=0.0;

	  /* loop over the collumns of this row */
	  endrow = Arows[i+1];
	  for (j=Arows[i]; j<endrow; j++)
	    {
	      col=Acols[j];

#ifdef DEBUGFEINS
	      if (col>=col_max)
		{
		  fprintf(stderr,
			  "sparse_GS_sweep_fwd: invalid col>=col_max\n");
		  FEINS_FAILURE_ACTION;
		}
#endif

	      axrow += A[j] * xv[col];
	      /* next collumn */
	    }

	  /* multiplication of this row complete, apply GS correction
	     to x */
	  //#pragma omp ordered
	  xv[i] -= (axrow - bv[i]) * invdiagV[i];
	} /* next row */
    } /* end  if (my_id<DD_num_threads) */
# pragma omp barrier
  } /* end for wave */
	
  } /* next sweep */

  } /* end omp parallel */
  return SUCCESS;
}

/*FUNCTION*/
int sparse_GS_sweep_bwd(struct sparse *K, struct vector *b,
			struct vector *invdiag, FIDX sweeps,
			struct vector *x
/* applies sweeps Gauss-Seidel (GS) sweeps to improve the x as
   solution of 

    K * x = b;

   Input:  K       - sparse matrix
           b       - right hand side vector
           invdiag - inverse of the diagonal elements (1/a_ii)
           sweeps  - number of GS sweeps to be performed during this call

   In/Out: x       - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX col_max, row_max;
  int err;

  row_max = (*K).row_nr;
  col_max = (*x).len;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      fprintf( stderr, "WARNING: sparse_GS_sweep_bwd, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n");
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_GS_sweep_bwd );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_GS_sweep_bwd: "
	       "matrix has incompatible type \n");
      return FAIL;
    }

  if (((*K).DD_rows_idx==NULL)||((*K).DD_rows==NULL))
    {
      fprintf(stderr,
	      "sparse_GS_sweep_bwd: K.DD_*not set\n");
      return FAIL;
    }

  if (((row_max != (*x).len)||((row_max != (*b).len)))
      ||(row_max != (*invdiag).len))
    {
      fprintf(stderr, "sparse_GS_sweep_bwd: "
	      "supplied vector has wrong size\n");
      fprintf(stderr, "K.row_max=%"dFIDX", b.len=%"dFIDX", invdiag.len=%"dFIDX", x.len=%"dFIDX"\n",
	       row_max,  (*b).len,
	       (*invdiag).len,  (*x).len);
      return FAIL;
    }

#pragma omp parallel
  {
  FIDX i, j, endrow, sweep, col;

  FIDX irow;
  int my_id, OMP_num_threads, DD_num_threads, wave;

  /* simpliefied acces to A, b, x and invdiag */
  double *A, *bv, *xv, *invdiagV;
  double axrow;
  FIDX   *Arows, *Acols;


  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  bv      = (*b).V;
  xv      = (*x).V;
  invdiagV= (*invdiag).V;

#ifdef HAVE_OPENMP
  my_id      = omp_get_thread_num();
  OMP_num_threads = omp_get_num_threads();
#else
  my_id      = 0;
  OMP_num_threads = 1;
#endif
  if (OMP_num_threads<(*K).DD_proc_nr)
    {
      fprintf(stderr,
	      "sparse_GS_sweep_bwd: wrong number of treads\n"
	      "   is %d < K.DD_proc_nr=%d\n",
	      OMP_num_threads, (*K).DD_proc_nr);
      FEINS_FAILURE_ACTION
    }
  DD_num_threads=(*K).DD_proc_nr;


  for (sweep=0; sweep<sweeps; sweep++)
    {
      /* loop over all rows */
      for (wave=(*K).DD_waves_nr-1; wave>=0; wave--)
	{
	  if (my_id<DD_num_threads)
	    {
	      for (irow=(*K).DD_rows_idx[wave*DD_num_threads+my_id+1]-1; 
		   irow>=(*K).DD_rows_idx[wave*DD_num_threads+my_id];
		   irow--)
        {
	  i=(*K).DD_rows[irow];
	  
	  axrow=0.0;

	  /* loop over the collumns of this row */
	  endrow = Arows[i+1];
	  for (j=Arows[i]; j<endrow; j++)
	    {
	      col=Acols[j];

#ifdef DEBUGFEINS
	      if (col>=col_max)
		{
		  fprintf(stderr,
			  "sparse_GS_sweep_bwd: invalid col>=col_max\n");
		  FEINS_FAILURE_ACTION;
		}
#endif

	      axrow += A[j] * xv[col];

	      /* next collumn */
	    }

	  /* multiplication of this row complete, apply GS correction
	     to x */
	  xv[i] -= (axrow - bv[i]) * invdiagV[i];

	} /* next row */
	    } /* end  if (my_id<DD_num_threads) */
# pragma omp barrier
	} /* end for wave */
    } /* next sweep */
  } /* end omp parallel */

  return SUCCESS;
}


/*FUNCTION*/
int sparse_GS_sweep_sorted(struct sparse *K, struct vector *b,
			   struct vector *invdiag, 
			   FIDX *sorter, int dir, FIDX sweeps,
			   double scale, struct vector *x
/* applies sweeps Gauss-Seidel (GS) sweeps to improve the x as
   solution of 

    K * x = b;

   Input:  K       - sparse matrix
           b       - right hand side vector
           invdiag - inverse of the diagonal elements (1/a_ii)
	   sorter  - permutation of the dofs that sorts them in a
	             particular maner, e.g. stream wise
           dir     - direction of the sweeps dir=+1 =>fwd, dir=-1 =>bwd
           sweeps  - number of GS sweeps to be performed during this
                     call
	   scale   - multiply the update by this factor, used for
	             damping, 0<scale<1 is advisable
	   

   In/Out: x       - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX idx, srt, end, row, endrow, j, sweep, col, col_max, row_max;
  int err;

  /* simpliefied acces to A, b, x and invdiag */
  double *A, *bv, *xv, *invdiagV;
  double axrow;
  FIDX   *Arows, *Acols;


  if ( (*K).type == SP_TYPE_FLEX )
    {
      fprintf( stderr, "WARNING: sparse_GS_sweep_sorted, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n");
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_GS_sweep_sorted );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_GS_sweep_sorted: "
	       "matrix has incompatible type \n");
      return FAIL;
    }

#warning "to be changed for DD_rows"

  row_max = (*K).row_nr;
  col_max = (*x).len;

  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  bv      = (*b).V;
  xv      = (*x).V;
  invdiagV= (*invdiag).V;

  if (((row_max != (*x).len)||((row_max != (*b).len)))
      ||(row_max != (*invdiag).len))
    {
      fprintf(stderr, "sparse_GS_sweep_sorted: "
	      "supplied vector has wrong size\n");
      fprintf(stderr, "K.row_max=%"dFIDX", b.len=%"dFIDX", invdiag.len=%"dFIDX", x.len=%"dFIDX"\n",
	       row_max,  (*b).len,
	       (*invdiag).len,  (*x).len);
      return FAIL;
    }

  /* set begin and end according to direction */
  switch(dir)
    {
    case +1:
      srt=0;
      end=row_max;
      break;
    case -1:
      srt=row_max-1;
      end=-1;
      break;
    default:
      fprintf(stderr, "sparse_GS_sweep_sorted: "
	      "direction has to be +1 or -1\n");
      return FAIL;
    }
    

  for (sweep=0; sweep<sweeps; sweep++)
    {
      /* loop over sorted rows */
      for (idx=srt; idx!=end; idx+=dir)
	{
	  row=sorter[idx];

	  axrow=0.0;

	  /* loop over the collumns of this row */
	  endrow = Arows[row+1];
	  for (j=Arows[row]; j<endrow; j++)
	    {
	      col=Acols[j];

#ifdef DEBUGFEINS
	      if (col>=col_max)
		{
		  fprintf(stderr,
			  "sparse_GS_sweep_sorted: invalid col>=col_max\n");
		  return FAIL;
		}
#endif

	      axrow += A[j] * xv[col];

	      /* next collumn */
	    }

	  /* multiplication of this row complete, apply GS correction
	     to x */
	  xv[row] -= (axrow - bv[row]) * invdiagV[row] * scale;
	} /* next row */
    } /* next sweep */
  return SUCCESS;
}



/*FUNCTION*/
int sparse_vec_mat_vec(struct sparse *K, struct vector *u,
		       struct vector *v, double *out
/* computes the product
     out = v^T *K* u
   for the sparse matrix K and the vectors u and v

   Input:  K       - sparse matrix
           u       - vector
           v       - vector

   Output: out     - scalar, given by reference

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX i, j, endrow, col, col_max, row_max;
  int err;

  double *A, *uv, *vv;
  double axrow;
  FIDX   *Arows, *Acols;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      fprintf( stderr, "WARNING: sparse_vec_mat_vec, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n");
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_vec_mat_vec );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_vec_mat_vec: "
	       "matrix has incompatible type \n");
      return FAIL;
    }

#warning "to be changed for DD_rows"


  row_max = (*K).row_nr;
  col_max = (*u).len;

  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  uv      = (*u).V;
  vv      = (*v).V;

  if (row_max != (*v).len)
    {
      fprintf(stderr, "sparse_vec_mat_vec: "
	      "supplied vector v has wrong size\n");
      fprintf(stderr, "row_max=%"dFIDX", v.len=%"dFIDX"\n",
	       row_max,  (*v).len);
      return FAIL;
    }

  /* init out=0 */
  *out=0.0;

  /* loop over all rows of K */
  for (i=0; i<row_max; i++)
    {
      axrow=0.0;
      /* loop over the collumns of this row */
      endrow = Arows[i+1];
      for (j=Arows[i]; j<endrow; j++)
	{
	  col=Acols[j];

#ifdef DEBUGFEINS
	  if (col>=col_max)
	    {
	      fprintf(stderr,
		      "sparse_vec_mat_vec: invalid col>=col_max\n");
	      return FAIL;
	    }
#endif

	  axrow += A[j] * uv[col];

	  /* next collumn */
	}
      *out+=vv[i]*axrow;
    } /* next row */
  return SUCCESS;
}



/*FUNCTION*/
int sparse_mul_mat_vec_add(struct sparse *K, struct vector *vec,
			   struct vector *out
/* multiplies the sparse matrix  K from left to the vector vec and
   adds the result to out,
   
   out = out + K * vec;

   Input:  K       - sparse matrix
           vec     - vector

   In/Out: out     - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX col_max, row_max;

  row_max = (*K).row_nr;
  col_max = (*vec).len;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      int err;
     /* fprintf( stderr, "WARNING: sparse_mul_mat_vec_add, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n"); */
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_mul_mat_vec_add );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_mul_mat_vec_add: "
	       "matrix has incompatible type \n");
      return FAIL;
    }

  if (row_max != (*out).len)
    {
      fprintf(stderr, "sparse_mul_mat_vec_add: "
	      "supplied vector out has wrong size\n");
      fprintf(stderr, "row_max=%"dFIDX", out.len=%"dFIDX"\n",
	       row_max,  (*out).len);
      return FAIL;
    }

#warning "to be changed for DD_rows"


#pragma omp parallel
  {
  FIDX i, j, endrow, col;

  /* simpliefied acces to A, input vector iv and output ov */
  double *A, *iv, *ov;
  double axrow;
  FIDX   *Arows, *Acols;

  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  iv      = (*vec).V;
  ov      = (*out).V;


  /* loop over all rows */
#pragma omp for
  for (i=0; i<row_max; i++)
    {
      axrow=0.0;
      /* loop over the collumns of this row */
      endrow = Arows[i+1];
      for (j=Arows[i]; j<endrow; j++)
	{
	  col=Acols[j];

#ifdef DEBUGFEINS
	  if (col>=col_max)
	    {
	      fprintf(stderr,
		      "sparse_mul_mat_vec_add: invalid col>=col_max\n");
	      FEINS_FAILURE_ACTION;
	    }
#endif

	  axrow += A[j] * iv[col];

	  /* next collumn */
	}
      ov[i] += axrow;
    } /* next row */
  } /* end omp parallel */

  return SUCCESS;
}


/*FUNCTION*/
int sparse_row_tim_vec(struct sparse *K, FIDX rown, FIDX *rows, 
		       struct vector *vec, struct vector *out
/* multiplies the specified rows of the sparse matrix  K from left to
   the vector vec, 
   
   out[i] = K_rows[i] * vec;     

   Input:  K       - sparse matrix
           rown    - number of given rows
	   rows    - integer vector of length rown, specifying the
                     rows for which the product shall be computed
           vec     - vector

   Output: out     - vector containing the results,
                     out[i] = K_rows[i] * vec

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX i, j, endrow, col, col_max, row_max;
  int err;

  double *A, *iv, *ov;
  double axrow;
  FIDX   *Arows, *Acols;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      fprintf( stderr, "WARNING: sparse_row_tim_vec, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n");
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_row_tim_vec );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_row_tim_vec: "
	       "matrix has incompatible type \n");
      return FAIL;
    }


  row_max=(*K).row_nr;
  col_max=(*vec).len;

  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  iv      = (*vec).V;
  ov      = (*out).V;

  if (rown != (*out).len)
    {
      fprintf(stderr, "sparse_row_tim_vec: "
	      "supplied vector out has wrong size\n");
      fprintf(stderr, "rown=%"dFIDX", out.len=%"dFIDX"\n",
	       rown,  (*out).len);
      return FAIL;
    }

  /* loop over the given rows */
  for (i=0; i<rown; i++)
    {
      if ((rows[i]<0)||(rows[i]>=row_max))
	{
	  fprintf(stderr, "sparse_row_tim_vec: "
		  "rows[%"dFIDX"]=%"dFIDX" out of range!\n",
		   i,  rows[i]);
	  fprintf(stderr, "row_max=%"dFIDX"\n",  row_max);
	  return FAIL;
	}


      axrow=0.0;

      /* loop over the collumns of this row */
      endrow = Arows[rows[i]+1];
      for (j=Arows[rows[i]]; j<endrow; j++)
	{
	  col=Acols[j];

#ifdef DEBUGFEINS
	  if (col>=col_max)
	    {
	      fprintf(stderr,
		      "sparse_rows_tim_vec: invalid col>=col_max\n");
	      return FAIL;
	    }
#endif

	  axrow += A[j] * iv[col];

	  /* next collumn */
	}
      ov[i] = axrow;
    }
  return SUCCESS;
}



/*FUNCTION*/
int sparse_row_tim_vec_add(struct sparse *K, FIDX rown, FIDX *rows, 
		       struct vector *vec, struct vector *out
/* multiplies the specified rows of the sparse matrix  K from left to
   the vector vec and adds the result to out,
   
   out[i] = out[i] + K_row[i] * vec;

   Input:  K       - sparse matrix
           rown    - number of given rows
	   rows    - integer vector of length rown, specifying the
                     rows for which the product shall be computed
           vec     - vector

   In/Out: out     - vector containing the results,
                     out[i] = out[i] + K_row[i] * vec

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX i, j, endrow, col, col_max, row_max;
  int err;

  double *A, *iv, *ov;
  double axrow;
  FIDX   *Arows, *Acols;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      fprintf( stderr, "WARNING: sparse_row_tim_vec_add, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n");
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_row_tim_vec_add );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_row_tim_vec_add: "
	       "matrix has incompatible type \n");
      return FAIL;
    }


  row_max=(*K).row_nr;
  col_max=(*vec).len;

  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  iv      = (*vec).V;
  ov      = (*out).V;

  if (rown != (*out).len)
    {
      fprintf(stderr, "sparse_row_tim_vec_add: "
	      "supplied vector out has wrong size\n");
      fprintf(stderr, "rown=%"dFIDX", out.len=%"dFIDX"\n",
	       rown,  (*out).len);
      return FAIL;
    }

  /* loop over the given rows */
  for (i=0; i<rown; i++)
    {
      if ((rows[i]<0)||(rows[i]>=row_max))
	{
	  fprintf(stderr, "sparse_row_tim_vec_add: "
		  "rows[%"dFIDX"]=%"dFIDX" out of range!\n",
		   i,  rows[i]);
	  fprintf(stderr, "row_max=%"dFIDX"\n",  row_max);
	  return FAIL;
	}


      axrow=0.0;

      /* loop over the collumns of this row */
      endrow = Arows[rows[i]+1];
      for (j=Arows[rows[i]]; j<endrow; j++)
	{
	  col=Acols[j];

#ifdef DEBUGFEINS
	  if (col>=col_max)
	    {
	      fprintf(stderr,
		      "sparse_rows_tim_vec_add: invalid col>=col_max\n");
	      return FAIL;
	    }
#endif

	  axrow += A[j] * iv[col];

	  /* next collumn */
	}
      ov[i] += axrow;
    }
  return SUCCESS;
}



/*FUNCTION*/
int sparse_mul_mat_vec_add_trans(struct sparse *K, struct vector *vec,
				 struct vector *out
/* multiplies the transpose of the sparse matrix  K from left to the
   vector vec and adds the result to out,
   
   out = out + K^T * vec;

   Input:  K       - sparse matrix
           vec     - vector

   In/Out: out     - resulting vector

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		     ){
  FIDX i, j, endrow, col, col_max, row_max;
  int err;

  double *A, *iv, *ov;
  FIDX   *Arows, *Acols;

  if ( (*K).type == SP_TYPE_FLEX )
    {
      fprintf( stderr, "WARNING: sparse_mul_mat_vec_add_trans, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n");
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       sparse_mul_mat_vec_add_trans );
    }
  if ( (*K).type != SP_TYPE_COMPROW )
    {
      fprintf( stderr, "sparse_mul_mat_vec_add_trans: "
	       "matrix has incompatible type \n");
      return FAIL;
    }

#warning "to be changed for DD_rows"

  row_max=(*K).row_nr;
  col_max=(*out).len;
  
  A       = (*K).A;
  Arows   = (*K).rows;
  Acols   = (*K).cols;
  iv      = (*vec).V;
  ov      = (*out).V;

  if (row_max != (*vec).len)
    {
      fprintf(stderr, "sparse_mul_mat_vec_add_trans: "
	      "supplied vector vec has wrong size\n");
      return FAIL;
    }

  /* loop over all rows */
  for (i=0; i<row_max; i++)
    {
      /* loop over the collumns of this row */
      endrow = Arows[i+1];
      for (j=Arows[i]; j<endrow; j++)
	{
	  col=Acols[j];

#ifdef DEBUGFEINS
	  if (col>=col_max)
	    {
	      fprintf(stderr, "sparse_mul_mat_vec_add_trans: "
		      "invalid col>=col_max, row=%"dFIDX"  nnz=%"dFIDX" "
		      "col=%"dFIDX" col_max=%"dFIDX"\n",
		       i,  j,  col,  col_max);
	      return FAIL;
	    }
#endif

	  ov[col] += A[j] * iv[i];

	  /* next collumn */
	}
    } /* next row */
  return SUCCESS;
}




/*FUNCTION*/
int sparse_convert_compressed_row(struct sparse *K
/* converts a matrix stored in flexible mode to compressed row mode

   In/Out:  K      - sparse matrix, storage type is converted

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
				  ){
  FIDX max_cols;
  FIDX *Arows, *Acols;
  double *A;

  if ((*K).type != SP_TYPE_FLEX)
    {
      fprintf( stderr, "sparse_convert_compressed_row: "
	       "matrix must be type SP_TYPE_FLEX !\n");
      return FAIL;
    }

#ifdef HAVE_OPENMP
  if (omp_in_parallel())
    {
      fprintf(stderr,"sparse_convert_compressed_row: "
	      "error, this should not be called from within "
	      "an OMP parallel region!\n");
      FEINS_FAILURE_ACTION;
    }
#endif

#pragma omp parallel
  {
  FIDX i, j, row_nr;
  struct intdouble *sorter;
  FIDX *this_cols;
  double *this_data;

  row_nr = (*K).row_nr;

#pragma omp master
  {
    FIDX count_large_rows=0, count_extra_large_rows=0;

    /* allocate the rows vector, then the entries in each row */
    TRY_MALLOC( Arows, row_nr+1, FIDX, sparse_convert_compressed_row);

    /* count columns in each row */
    /* loop over all rows */
    Arows[0]=0;
    max_cols=0;
    for (i=0; i<row_nr; i++)
      {
	/* end of this row */
	Arows[i+1]=Arows[i]+(*K).flex_row_col_nr[i];
	if ((*K).flex_row_col_nr[i]>max_cols)
	  max_cols=(*K).flex_row_col_nr[i];
	if ( (*K).flex_row_col_nr[i]> (*K).flex_row_col_max )
	  count_large_rows++;
	if ( (*K).flex_row_col_nr[i]> 2*(*K).flex_row_col_max )
	  count_extra_large_rows++;
      }
    
    /* warn if more than 1% rows longer than flex_row_col_max */
    if (count_large_rows*100 > row_nr )
      {
	fprintf(stderr,"sparse_convert_compressed_row: "
		"warning, %"dFIDX"rows (%5.1f\%) longer than flex_row_col_max\n",
		count_large_rows,
		((double) count_large_rows*100.00)/( (double) row_nr) );
      }
    /* warn if any rows longer than 2*flex_row_col_max */
    if (count_extra_large_rows> 0 )
      {
	fprintf(stderr,"sparse_convert_compressed_row: "
		"warning, %"dFIDX"rows (%5.1f\%) longer than 2*flex_row_col_max\n",
		count_extra_large_rows,
		((double) count_extra_large_rows*100.00)/( (double) row_nr) );
      }

    /* now we know the number of entries in each row and in total,
       allocate the cols vector and the A vector */
    TRY_MALLOC( Acols, Arows[row_nr], FIDX, sparse_convert_compressed_row);
    TRY_MALLOC( A, Arows[row_nr], double, sparse_convert_compressed_row);
  } /* end omp master */

#pragma omp barrier

  /* allocate memory for a sorter for the columns of any row */
  TRY_MALLOC(sorter, max_cols, struct intdouble,
	     sparse_convert_compressed_row);
  /* corresponding columns and data */
  TRY_MALLOC(this_cols, max_cols, FIDX, sparse_convert_compressed_row);
  TRY_MALLOC(this_data, max_cols, double, sparse_convert_compressed_row);
  
  /* copy the entries of the matrix */
  /* loop over all rows */
#pragma omp for schedule(static)
  for (i=0; i<row_nr; i++)
    {
      FIDX row_start;
      struct sparse_flex_col_block *block;
      FIDX col_max=(*K).flex_row_col_max;

      /* sort the columns of this row */
      block=&(*K).flex_row_cols[i];
      for (j=0; j<(*K).flex_row_col_nr[i]; j++)
	{
	  FIDX col= (*block).cols_idx[j%col_max];

	  sorter[j].i=j;
	  sorter[j].d=-col; 	  /* (-col) because of descending sort */

	  this_cols[j]=col;
	  this_data[j]=(*block).cols_data[j%col_max];

	  /* at end of block */
	  if ( (j%col_max)==col_max-1 )
	    {
	      /* switch to next block */
	      block=(*block).next;
	    }
	}
      qsort( &sorter[1], (*K).flex_row_col_nr[i]-1, sizeof(struct intdouble), 
	     comp_intdouble_d); 

      row_start=Arows[i];
      /* loop over the sorted columns of this row */
      for (j=0; j<(*K).flex_row_col_nr[i]; j++)
	{
	  FIDX sorted_j=sorter[j].i;
	  Acols[row_start+j] = this_cols[sorted_j];
	  A[row_start+j]     = this_data[sorted_j];
	}
      /* end of this row */
    }

  /* free the sorter, this_cols, this_data */
  free(this_data);
  free(this_cols);
  free(sorter);

#pragma omp barrier
  } /* end omp parallel */

  { /* serial block */
    FIDX i,j;

    /* now we store the pointers in the matrix struct */
    (*K).rows = Arows;
    (*K).cols = Acols;
    (*K).A    = A;

    /* switch the type */
    (*K).type = SP_TYPE_COMPROW;

    /* free the now redundant parts of the matrix struct */
    for (i=0; i<(*K).row_nr; i++)
      {
	free( (*K).flex_row_cols[i].cols_idx );
	free( (*K).flex_row_cols[i].cols_data );
      }
    for (i=0; i<(*K).nr_processors; i++)
      {
	struct sparse_flex_reserve_list *this, *head, *next;
	head=&(*K).flex_reserve[i];
	this=head;
	while (this!=NULL)
	  {
	    FIDX nr_reserve_blocks = (*this).max;
	    
	    for (j=0; j<nr_reserve_blocks; j++)
	      {
		struct sparse_flex_col_block *block;
		block=&(*this).blocks[j];
		free( (*block).cols_idx );
		free( (*block).cols_data );
	      }
	    free( (*this).blocks );
	    
	    next=(*this).next;
	    if (this!=head)
	      {
		free(this);
	      }
	    this=next;
	  }
      }
    free((*K).flex_row_cols);
    free((*K).flex_row_col_nr);
    (*K).flex_row_col_max=0;
    (*K).flex_row_col_nr  = NULL;
    (*K).flex_row_cols    = NULL;
    (*K).nr_processors    = 0;
    free( (*K).flex_reserve );
    (*K).flex_reserve     = NULL;
    

    /*fprintf(stderr,"sparse_convert_compressed_row DEBUG: "
      "avg cols/row=%8.2e\n",
      (1.0*(*K).rows[(*K).row_nr])/(1.0*(*K).row_nr));/**/
  } /* end serial block */

  /* DD_rows and other DD_* stuff is to be left as is */

  return SUCCESS;
}



/*FUNCTION*/
int sparse_color_dofs(struct sparse *K
/* colors the dofs of a matrix to allow parallel GS-smoother

   In/Out:  K      - sparse matrix, coloring data ist added

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
				  ){
  FIDX i, j, endrow, row_nr;

  FIDX max_col, this_cols, this, check_if_colored;

  FIDX *color;

  struct ilist *new_cols_first, *new_cols_last;

  row_nr=K->row_nr;

  /* how to test this ????
     if (row_nr!=K->col_max)
     {
     fprintf(stderr, "sprase_color_dofs: require row_nr==col_nr\n");
     return FAIL;
     } */ 
  
  TRY_MALLOC(color, row_nr, FIDX, sprase_color_dofs);

  /* initialise color for each dof */
  for (i=0; i<row_nr; i++)
    { 
      color[i]=-1;
    }

  /* initialise list to be empty */
  new_cols_first =NULL;
  new_cols_last  =NULL;

  /* outer loop, check each node if it has already been colored */
  for (check_if_colored=0; check_if_colored<row_nr; check_if_colored++)
    {
#ifdef FEINS_have_warning 
#warning "sparse_color_dofs: not complete yet"
#endif 
    }

  return SUCCESS;
}




/*FUNCTION*/
int sparse_extract_invdiag(struct sparse *K, struct vector *Kinvdiag
/* the inverse of the diagonal of K is stored in the vector Kinvdiag

   Input:   K      - sparse matrix

   Output:  Kinvdiag
                   - (by reference) if size does not fit K, the vector
                     is reinitialised, on exit i-th component of
                     Kinvdiag is set to 1/(K_ii),
		     

   Return: SUCCESS - success,
           FAIL    - failure
*/
			   ){
  int  err;
  FIDX i;
  double *entry;

  if (K->row_nr != Kinvdiag->len)
    {
      vector_free( Kinvdiag);
      err=vector_alloc( Kinvdiag, K->row_nr);
      FUNCTION_FAILURE_HANDLE(err, vector_alloc, sparse_extract_invdiag);
    }

  /* get each diag entry, store the inverse */
  for (i=0; i<K->row_nr; i++)
    {
      err=sparse_get_entry( K, i, i, 0, &entry);
      FUNCTION_FAILURE_HANDLE(err, sparse_get_entry,
			      sparse_extract_invdiag);

      if ((*entry)!=0.0)
	{
	  Kinvdiag->V[i]= 1.0 / ( (*entry) );
	}
      else
	{
	  fprintf(stderr, "sparse_extract_invdiag: K has zero diagonal "
		  "entry! entry %"dFIDX"\n",  i);
	  return FAIL;
	}
    }

  return SUCCESS;
}















/*FUNCTION*/ 
void vector_init(struct vector *V 
/* V initialised to be empty

   Output: V       - (V is given by reference) defined to be empty
*/
		  ){
  V->V       = NULL;

  V->len   = 0 ;
  V->n_max = 0 ;
}

/*FUNCTION*/
int vector_alloc(struct vector *V, FIDX len
/* allocates memory in V 
   
   Input:  len     - length of the vector

   Output: V       - (V is given by reference), memory in V is
                     allocated, dimensions of V initialised

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  TRY_MALLOC( V->V, len, double, vector_alloc);

  
  V->len   = len ;
  V->n_max = len ;

  return SUCCESS;
}

/*FUNCTION*/
void vector_free(struct vector *V
/* memory allocated in V is freed, so that V can be freed

   Output: V       - (V is given by reference) inside memory is
                     released 
*/
		  ){
  free(V->V);
  V->V       = NULL;

  V->len   = 0 ;
  V->n_max = 0 ;
}



/*FUNCTION*/
int vector_random(struct vector *vec, int *seed
/* fills the vector with (pseudo) random numbers of normal (0,1)
   distribution as generated by LAPACK routine DLARNV
      
   In/Out: seed    - array of seed values, length 4,
                     from documentation of DLARNV:
                     on entry, the seed of the random number generator;
		     the array elements must be between 0 and 4095,
		     and seed[3] must be odd;
		     on exit, the seed is updated.

   Output: vec     - vector, the contents are assigned 

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  FIDX i;
  int f_len;
  int f_three=3;

  for (i=0; i<4; i++)
    {
      if ((seed[i]<0)||(seed[i]>4095))
	{
	  fprintf(stderr, "vector_random: "
		  "illegal seed value, seed[%"dFIDX"]=%d\n", i, seed[i]);
	  return FAIL;
	}
    }
  if (seed[3]%2==0)
    {
      fprintf(stderr, "vector_random: "
	      "seed[3]=must be odd, got seed[3]=%d\n", seed[3]);
      return FAIL;
    }

  
  if ((*vec).len<0)
    {
      fprintf(stderr, "vector_random: "
	      "vec.len must be >=0\n");
      return FAIL;
    }
  
  f_len=(*vec).len;
  
  dlarnv_(&f_three, seed, &f_len, (*vec).V);

  return SUCCESS;
}



/*FUNCTION*/
int vector_diag_scale(void *notused, struct vector *in,
		      void *arg3, struct vector *out
/* performs
     out = D*in
   where D is a diagonal matrix with entries as specified in the
   vector diag, 

   Input:  notused - well, it is not used but included in the
                     interface to allow this function to be used as a
                     preconditioner (i.e. if diag is the inverse of
                     the diagonal of a matrix, this is the Jacobi
                     preconditioner)
           in      - input vector
	   arg3=
	   diag    - vector storing the diagonal entries of D,

   Output: out    - (given by reference), the D*in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i;

  struct vector *diag;
  diag=arg3;

  if ((in->len != out->len)||(in->len != diag->len))
    {
      fprintf(stderr, "vector_diag_scale: vector sizes don't match!\n");
      fprintf(stderr, "diag.len=%"dFIDX", vec.len=%"dFIDX"!\n",
	       diag->len,  in->len);
      return FAIL;
    }

  for (i=0; i< in->len; i++)
    {
      out->V[i] = diag->V[i] * in->V[i];
    }

  return SUCCESS;
}

/*FUNCTION*/
int vector_read_file(struct vector *vec, char *name
/* reads the contents of the file into the vector, memory is allocated
   for that
      
   Input:  name    - name of the file

   Output: vec     - vector, memory is allocated and the contents
                     assigned 

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  int  err;
  FIDX i, len;
  long fstart;
  double dhlp;

  FILE *in;

 /* open the file */
  in=fopen(name, "r");
  if (in==NULL)
    {
      fprintf(stderr, "vector_read_file: "
	      "error opening file \"%s\"\n", name);
      return FAIL;
    }

  /* save the position to fstart */
  fstart = ftell(in);

  /* find out the length of the vector */
  len=0;
  while (fscanf(in,"%lg",&dhlp)==1) len++;

  /* rewind to actually read the vector */
  fseek(in, fstart, SEEK_SET);

  /* allocate memory */
  TRY_MALLOC( vec->V, len, double, vector_read_file);

  /* loop over all rows */
  for (i=0; i<len; i++)
    {
      err=fscanf(in, "%lg", &vec->V[i]);
      if (err!=1) err=FAIL; else err=SUCCESS;
      FUNCTION_FAILURE_HANDLE(err, fscanf, vector_read_file);
    }

  /* set the vector length */
  vec->len   = len;
  vec->n_max = len;

  /* close the file */
  fclose(in);

  return SUCCESS;
}







/*FUNCTION*/
int vector_write_file(struct vector *vec, char *name
/* writes the contents of the vector to a file with the specified name
      
   Input:  vec     - vector
           name    - name for the file

   Output: (the file)

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  int  err;
  FIDX i, len;

  FILE *out;

  len= vec->len;

 /* open the file */
  out=fopen(name, "w");
  if (out==NULL)
    {
      fprintf(stderr, "vector_write_file: "
	      "error opening file \"%s\"\n", name);
      return FAIL;
    }

  /* loop over all rows */
  for (i=0; i<len; i++)
    {
      err=fprintf(out, "%+24.16e\n", vec->V[i]);
      if (err<0) err=FAIL; else err=SUCCESS;
      FUNCTION_FAILURE_HANDLE(err, fprintf, vector_write_file);
    }

  /* close the file */
  fclose(out);

  return SUCCESS;
}


/*FUNCTION*/
int vector_n_write_file(FIDX nCol, struct vector *vec, char *name
/* writes the contents of an array of column vectors to a file with 
   the specified name
      
   Input:  nCol    - number of vectors in the array
           vec     - array of vectors, all vec[j] have to be of same
                     length j=0..nCol-1, the file will be written as a
                     matrix with nCol columns and vec[0].len rows,
           name    - name for the file

   Output: (the file)

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  int  err;
  FIDX i, j, len;

  FILE *out;

  if (nCol>0)
    len= vec[0].len;
  else
    len = 0;
  
  for (j=1; j<nCol; j++)
    {
      if (len!=vec[j].len)
	{
	  fprintf(stderr, "vector_n_write_file: "
		  "all vectors in the array must have same length\n");
	  return FAIL;
 	}
    }

 /* open the file */
  out=fopen(name, "w");
  if (out==NULL)
    {
      fprintf(stderr, "vector_n_write_file: "
	      "error opening file \"%s\"\n", name);
      return FAIL;
    }

  /* loop over all rows */
  for (i=0; i<len; i++)
    {
      for (j=0; j<nCol; j++) /* loop over columns */
	{
	  err=fprintf(out, "%+24.16e   ", vec[j].V[i]);
	  if (err<0) err=FAIL; else err=SUCCESS;
	  FUNCTION_FAILURE_HANDLE(err, fprintf, vector_n_write_file);
	}

      err=fprintf(out, "\n");
      if (err<0) err=FAIL; else err=SUCCESS;
      FUNCTION_FAILURE_HANDLE(err, fprintf, vector_n_write_file);
    }

  /* close the file */
  fclose(out);

  return SUCCESS;
}














/*FUNCTION*/
int projector1_no_precon(void *notused, struct vector *in,
			 void *arg3, struct vector *out
/* performs
     out = P*I*P^T *in
   where P is a projector, defined to set a number of components of
   the vector to zero and not change the others,
   therefore
     P*I*P^T = P 
   in this special case, which is the way how the computation is
   performed 

   Input:  notused - well, it is not used but included in the
                     interface to allow this function to be used as a
                     "preconditioner" 
           in      - input vector
	   arg3=
	   P       - projector data

   Output: out    - (given by reference), the projection of in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i;

  struct projector1 *P;
  P=arg3;

  if (((*in).len!=(*out).len)||((*in).len<(*P).len))
    {
      fprintf(stderr,
	      "projector1_no_precon: dimensions make no sense!\n");
      return FAIL;
    }

  /* copy in to out */
  for (i=0; i<(*in).len; i++)
    {
      (*out).V[i]=(*in).V[i];
    }

  /* apply the projector */
  for (i=0; i<(*P).len; i++)
    {
      (*out).V[ (*P).V[i] ]   = 0.0 ;
    }

  return SUCCESS;
}

/*FUNCTION*/
int projector1_alloc(struct projector1 *V, FIDX len
/* allocates memory in V 
   
   Input:  len     - length of the projector1

   Output: V       - (V is given by reference), memory in V is
                     allocated, dimensions of V initialised

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  TRY_MALLOC( (*V).V, len, FIDX, projector1_alloc);

  
  (*V).len   = 0   ;
  (*V).n_max = len ;

  return SUCCESS;
}

/*FUNCTION*/
void projector1_free(struct projector1 *V
/* memory allocated in V is freed, so that V can be freed

   Output: V       - (V is given by reference) inside memory is
                     released 
*/
		  ){
  free((*V).V);

  (*V).len   = 0 ;
  (*V).n_max = 0 ;
}








/*FUNCTION*/
int projectorMorthoZ_no_precon(void *notused, struct vector *in,
			 void *arg3, struct vector *out
/* performs
     out = P*in
   where P is a M-orthogonal projector, such that the result is
   M-orthogonal to all vectors in Z,
   it is safe to have in==out

   Input:  notused - well, it is not used but included in the
                     interface to allow this function to be used as a
                     "preconditioner" 
           in      - input vector
	   arg3=
	   P       - projector data

   Output: out    - (given by reference), the projection of in

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		 ){
  FIDX i,k, len;
  int err;

  struct projectorMorthoZ *P;
  P=arg3;
  
  if (((*in).len!=(*out).len)||((*in).len!=(*(*P).M).row_nr))
    {
      fprintf(stderr,
	      "projectorMorthoZ_no_precon: dimensions make no sense!\n");
      return FAIL;
    }
  len = (*in).len;

  /* copy in to out */
  if (in!=out)
    {
      for (i=0; i<(*in).len; i++)
	{
	  (*out).V[i]=(*in).V[i];
	}
    }

  /* apply the projector */
  for (k=0; k<(*P).num; k++)
    {
      double scale = 0.0;

      /* compute scale = y^T M x = (y^T M x)/(x^T M x) */
      err=sparse_mul_mat_vec( (*P).M, out, &(*P).My );
      FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec, projectorMorthoZ_no_precon);
      for (i=0; i<len; i++)
	{
	  scale += (*P).My.V[i]* (*(*P).Z)[k].V[i] ;
	}

      /* add -scale*Z[k] */
      for (i=0; i<len; i++)
	{
	  (*out).V[ i ]  -= scale * (*(*P).Z)[k].V[i] ;
	}
    }

  return SUCCESS;
}

/*FUNCTION*/
int projectorMorthoZ_init(struct projectorMorthoZ *P, FIDX num,
			  struct vector **Z, struct sparse *M
/* allocates memory in V 
   
   Input:  num     - number of vectors Z
           M       - Mass matrix, a reference is stored in P,
                     P should be destroyed before M

   In/Out: Z       - array of num vectors to which P shall 
                     M-ortho-project,
		     the vectors are ortho-normalised, then the
		     pointer to them is stored (without copy),
		     (Z will be freed with P)

   Output: P       - (given by reference), all data is initialised


   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
			  ){
  FIDX i,j,k, row_nr;
  int err;

  (*P).num = -1;
  row_nr = (*M).row_nr;

  err=vector_alloc( &(*P).My, row_nr );
  FUNCTION_FAILURE_HANDLE( err, vector_alloc, projectorMorthoZ_init);

  for (k=0; k<num; k++)
    {

      if ((*Z)[k].len!=row_nr)
	{
	  fprintf(stderr,
		  "projectorMorthoZ_init: dimensions of Z[%"dFIDX"] and M "
		  "don't match!\n",k);
	  return FAIL;
	}

      /* M-orthogonalise to previous vectors */
      for (j=0; j<k; j++)
	{
	  double scale=0.0;

	  /* compute scale = y^T M x = (y^T M x)/(x^T M x) */
	  err=sparse_mul_mat_vec( M, &(*Z)[k], &(*P).My );
	  FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec, projectorMorthoZ_init);

	  for (i=0; i<row_nr; i++)
	    {
	      scale += (*P).My.V[i]* ((*Z)[j]).V[i] ;
	    }

	  /* add -scale*Z[j] */
	  for (i=0; i<row_nr; i++)
	    {
	      ((*Z)[k]).V[ i ]  -= scale *((*Z)[j]).V[i] ;
	    }
	}

      /* M-normalise Z[k] */
      {
	double scale=0.0;

	/* compute scale = y^T M x = (y^T M x)/(x^T M x) */
	err=sparse_mul_mat_vec( M, &(*Z)[k], &(*P).My );
	FUNCTION_FAILURE_HANDLE( err, sparse_mul_mat_vec, projectorMorthoZ_init);

	for (i=0; i<row_nr; i++)
	  {
	    scale += (*P).My.V[i]* ((*Z)[k]).V[i] ;
	  }
	if (scale==0.0)
	  {
	    fprintf(stderr,
		    "projectorMorthoZ_init: M-norm of Z[%"dFIDX"]==0.0 !"
		    "\n",k);
	    return FAIL;
	  }

	scale=1.0/sqrt(scale);

	/* multiply Z[k] *=scale */
	for (i=0; i<row_nr; i++)
	  {
	    ((*Z)[k]).V[ i ]  *= scale;
	  }
      } /* end M-normalise Z[k] */
    } /* for k */
  
  (*P).num = num;
  (*P).M   = M;
  (*P).Z   = Z;

  return SUCCESS;
}

/*FUNCTION*/
void projectorMorthoZ_free(struct projectorMorthoZ *P
/* memory allocated in P is freed, so that P can be freed

   Output: P       - (P is given by reference) inside memory is
                     released 
*/
		  ){
  FIDX k;

  for (k=0; k<(*P).num; k++)
    {
      vector_free( &(*(*P).Z)[k] );
    }
  free(*(*P).Z);
  free((*P).Z);
  vector_free( &(*P).My );

  (*P).num=-1;

  (*P).M=NULL;
}










/*FUNCTION*/
void int_double_list_free( struct int_double_list *root
/* free all enties of a list given by the root entry
   
   Output: root    - pointer to an entry, this entry and all
                     successors will be freed
*/
			   ){
  struct int_double_list *current, *next;

  current = root;
  while (current!=NULL)
    {
      next=current->succ;
      free(current);
      current=next;
    }
  return;
}





/*FUNCTION*/
void coarse_mat_free( struct coarse_mat *cmat
/* the internal parts of cmat are freed, so that cmat itself can safely
   be freed
   
   Output: cmat    - (given by reference) on output it will be empty
*/
		    ){
  free((*cmat).nodes);
  (*cmat).nodes=NULL;

  free((*cmat).A);
  (*cmat).A=NULL;

  free((*cmat).help);
  (*cmat).help=NULL;

  free((*cmat).ipiv);
  (*cmat).ipiv=NULL;

  (*cmat).nr     = 0;
  (*cmat).band_w = 0;
  (*cmat).symm   = 0;

  if ( (*cmat).HYPREamgIsInit == 1)
    {
      /* use clean up for HypreAMG version */
#ifdef HAVE_HYPRE_NO_MPI
      gen_hypre_AMG_free(&(*cmat).AMGdata);
#endif
      (*cmat).HYPREamgIsInit = 0;
    } /* end HypreAMG */

  if ( (*cmat).UMFPACKisinit == 1)
    {
      /* use clean up for UMFPACK version */
#ifdef HAVE_UMFPACK
      sparse_solve_UMFPACK_reuse(NULL, NULL, NULL,
				     &(*cmat).P, 
				     &(*cmat).UMFPACKdata,
				     &(*cmat).UMFPACKisdiri,
				     &(*cmat).UMFPACKhelp, 9);
      /* no error handling here */
      projector1_free(&(*cmat).P);
#endif
      (*cmat).UMFPACKisinit = 0;
    } /* end UMFPACK */
}



/*FUNCTION*/
void coarse_mat_null_def( struct coarse_mat *cmat
/* the internal parts of cmat are defined, so that cmat itself can
   be given to coarse_mat_free even though it has not been set 
   
   Output: cmat    - (given by reference) on output it will be empty
*/
		    ){
  (*cmat).nodes=NULL;
  (*cmat).A=NULL;
  (*cmat).help=NULL;
  (*cmat).ipiv=NULL;

  (*cmat).nr     = 0;
  (*cmat).band_w = 0;
  (*cmat).symm   = 0;

  (*cmat).HYPREamgIsInit=0;
  (*cmat).UMFPACKisinit =0;
}



/*FUNCTION*/
int coarse_mat_set( struct sparse *K, FIDX nr_diri, FIDX *diris,
		    int symm, struct coarse_mat *cmat
/* computes the triangular factorisation of the stiffness matrix K
   and stores it in cmat, such that this is suitable for the coarse
   grid solver
   
   Input:  K       - the stiffness matrix for the coarse grid
           nr_diri - number of degrees of freedom (DOFs) which are
	             subject to Dirichlet boundary conditions, such
	             that the according rows and columns of the
	             matrix have to be deleted 
           diris   - vector of length nr_diri, stating which DOFs
                     these are
	   symm    - marking the symmetry properties of the matrix,
	             if the matrix is known to be symmetric positive
	             definit, use symm=1 (so more efficient methods
	             can be used), 
		     if the matrix is known to have symmetric
		     connectivity but the matrix is not necessarily
		     symmetric, use symm=2 

   Output: cmat    - (given by reference) on output it will contain
                     the data for the coarse grid solver, internal
                     fields are initialised and memory allocated

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		    ){
  int  err;
  FIDX i, j, k, LDA, endrow, row, col;
  FIDX   *Arows, *Acols;

  FIDX *xadj, *adjncy;
                 /* the connection table */
  FIDX *perm, *iperm;
                 /* vectors defining permutations of the node
		    numbers */
  FIDX dofs;     /* number of DOFs  */

  FIDX bandw;    /* the bandwidth of the matrix */
  int  i_bandw;  /* integer copy of bandw */

  /* some vars for Fortran calls */
  char uplo;
  int fdofs, fLDA;

  coarse_mat_null_def(cmat);

  if ( (*K).type == SP_TYPE_FLEX )
    {
      /*fprintf( stderr, "WARNING: sparse_mul_mat_vec_add_trans, \n"
	       "coverting matrix from flexible to compressed "
	       "row format \n\n"); */
      err = sparse_convert_compressed_row( K );
      FUNCTION_FAILURE_HANDLE( err, sparse_convert_compressed_row, 
			       coarse_mat_set );
    }
  /* if we have hypre AMG, use it */
  /*#ifdef HAVE_HYPRE_NO_MPI
    err=sparse_solve_HypreAMG_cmat_set(K, nr_diri, diris, symm, cmat);
    FUNCTION_FAILURE_HANDLE( err,  sparse_solve_HypreAMG_cmat_set,
    coarse_mat_set );

    (*cmat).nr     = (*K).row_nr;
    (*cmat).UMFPACKisinit=0;
    #else
  */
  /* no hypre AMG */
#ifdef HAVE_UMFPACK
  /* if we have UMFPACK, use it */
  err=sparse_solve_UMFPACK_cmat_set(K, nr_diri, diris, symm, cmat);
  FUNCTION_FAILURE_HANDLE( err,  sparse_solve_UMFPACK_cmat_set,
                           coarse_mat_set );

  (*cmat).nr     = (*K).row_nr;
  (*cmat).HYPREamgIsInit=0;
#else
  /* no hypre AMG, no UMFPACK */
  (*cmat).HYPREamgIsInit=0;
  (*cmat).UMFPACKisinit=0;

  fprintf(stderr,"coarse_mat_set: warning, UMFPACK not available, "
          "using LAPACK band-matrix-solver, this is slower\n");

  Arows   = (*K).rows;
  Acols   = (*K).cols;


  dofs= (*K).row_nr;
  /* printf("dof %"dFIDX"\n",dofs); */
  /* allocate memory for the local data */
  TRY_MALLOC( xadj, (dofs+1), FIDX, coarse_mat_set);
  TRY_MALLOC( adjncy, Arows[dofs], FIDX, coarse_mat_set);

  TRY_MALLOC( perm, dofs, FIDX, coarse_mat_set);
  TRY_MALLOC( iperm, dofs, FIDX, coarse_mat_set);

  

  /* build up the connection table for the reordering */
  bandw=0;
  xadj[0]=0;
  for (i=0; i<dofs; i++)
    {

      xadj[i+1]=xadj[i];
      endrow = Arows[i+1];
      for (j=Arows[i]; j<endrow; j++)
	{
	  col=Acols[j];

	  if ( col !=i ) /* full adj list */
	    {
	      /* this is a new non-diagonal entry of K */
	      adjncy[xadj[i+1]]=col;
	      if (abs(col-i)>bandw) 
		{
		  bandw=abs(col-i);
		}
	      xadj[i+1]++;
	    }
	}
    }

  /* find a bandwidth reducing permutation of the DOFs */
  err=bandw_red_perm( dofs, xadj, adjncy, perm, iperm, &bandw);
  FUNCTION_FAILURE_HANDLE(err, bandw_red_perm, coarse_mat_set);

#ifdef DEBUGFEINS
  /* check permutation complete  and if bandwidth is true */
  for (i=0; i<dofs; i++) xadj[i]=0;
  for (i=0; i<dofs; i++) xadj[perm[i]]=1;
  for (i=0; i<dofs; i++) 
    {
      if ((xadj[i]!=1)||(perm[iperm[i]]!=i))
	{
	  fprintf( stderr, "coarse_mat_set: bandw_reduction failed, "
		   "permutation incomplete or inverse wrong\n");
	  return FAIL;
	}
    }

  xadj[0]=0; /* test bandw */
  for (i=0; i<dofs; i++)
    {
      /* loop over the collumns of this row */
      endrow = Arows[i+1];
      for (j=Arows[i]; j<endrow; j++)
	{
	  k=Acols[j];

	  row=perm[i];
	  col=perm[k];
	      
	  if (abs(row-col) > xadj[0])
	    {
	      xadj[0]=abs(row-col);
	    }
	}
    }
  if (xadj[0]!=bandw)
    {
      fprintf(stderr, "coarse_mat_set: bandw_reduction failed\n"
	      "real bandwidth=%"dFIDX"     claimed=%"dFIDX"\n",  xadj[0], bandw);
      return FAIL;
    }
#endif


  /*free some unneeded memory */
  free(xadj);
  free(adjncy);


  /* required to work: (apparently lapack has a different
     understanding of bandwidth) */ 
  bandw++;

  /* the required storage size depends on symm: */
  switch(symm)
    {
    case 1:
      LDA=bandw+1;
      break;
    case 2:
      LDA=3*bandw+1;
      break;
    default:
      fprintf(stderr, "coarse_mat_set: unknown symm=%d\n",
	      symm);
      return FAIL;
    }

  /* allocate memory for the band matrix and the other stuff in mat */
  TRY_MALLOC( (*cmat).A, dofs*LDA, double, coarse_mat_set);
  TRY_MALLOC( (*cmat).help, dofs, double,  coarse_mat_set);
  TRY_MALLOC( (*cmat).nodes, dofs, FIDX,    coarse_mat_set);
  if (symm==2)
    {
      TRY_MALLOC( (*cmat).ipiv, dofs, int, coarse_mat_set);
    }
  else (*cmat).ipiv=NULL;

  (*cmat).nr     = dofs;
  (*cmat).band_w = bandw;
  (*cmat).symm   = symm;
  
  /* clear the band matrix */
  for (i=0; i<dofs*LDA; i++) (*cmat).A[i]=0.0;

  /* build up the band matrix and the nodes field, dependent on symm: */
  if (symm==1)
    {
      for (i=0; i<dofs; i++)
	{
	  /* loop over the collumns of this row */
	  endrow = Arows[i+1];
	  for (j=Arows[i]; j<endrow; j++)
	    {
	      k=Acols[j];

	      row=perm[i];
	      col=perm[k];
		  
#ifdef DEBUGFEINS
	      if (abs(row-col)>bandw)
		{
		  fprintf(stderr, "coarse_mat_set: bandw=%"dFIDX" "
			  "seems to be wrong: row=%"dFIDX" col=%"dFIDX"  "
			  "symm=%d\n", bandw,
			   row,  col, symm);
		  return FAIL;
		}
#endif

	      if (row<=col) 
		{
		  /* this is an upper triangular entry, put it in
		     cmat.A */
		  (*cmat).A[col*LDA+bandw+row-col]=
		    (*K).A[j];
		}
	    }
	  /* build cmat.nodes */
	  (*cmat).nodes[i]=iperm[i];
	}
      /* replace the dirichlet rows and columns by unity */
      for (i=0; i<nr_diri; i++)
	{
	  FIDX maxcol, minrow;
	  
	  row=perm[diris[i]];
	  
	  maxcol=row+bandw;
	  if (maxcol>=dofs) maxcol=dofs-1;
	  for (col=row+1; col<=maxcol; col++)
	    {
	      (*cmat).A[col*LDA+bandw+row-col]=0.0;
	    }
	  
	  col=row;
	  
	  minrow=col-bandw;
	  if (minrow<0) minrow=0;
	  for (row=minrow; row<col; row++)
	    {
	      (*cmat).A[col*LDA+bandw+row-col]=0.0;
	    }

	  row=col;
	  (*cmat).A[col*LDA+bandw+row-col]=1.0;
	}
      uplo='U';
      fdofs =  dofs;
      fLDA  =  LDA;
      i_bandw= (int) bandw;
      if ((FIDX)i_bandw!= bandw)
	{
	  fprintf(stderr, 
		  "coarse_mat_set: integer overflow in bandw\n");
	  return FAIL;
	}

      dpbtrf_( &uplo, &fdofs, &i_bandw, (*cmat).A, &fLDA, &err); 
      if (err!=0)
	{
	  fprintf(stderr, 
		  "coarse_mat_set: LAPACK routine dpbtrf returned "
		  "with info=%d\n",
		  err);
	  return FAIL;
	}
    } /* end symm==1 */
  else if (symm==2)
    {
      for (i=0; i<dofs; i++)
	{
	  /* loop over the collumns of this row */
	  endrow = Arows[i+1];
	  for (j=Arows[i]; j<endrow; j++)
	    {
	      k=Acols[j];

	      row=perm[i];
	      col=perm[k];
		  
#ifdef DEBUGFEINS
	      if (abs(row-col)>bandw)
		{
		  fprintf(stderr, "coarse_mat_set: bandw=%"dFIDX" "
			  "seems to be wrong: row=%"dFIDX" col=%"dFIDX"  "
			  "symm=%d\n", bandw, 
			   row,  col, symm);
		  return FAIL;
		}
#endif
		  
	      /* put it in cmat.A */
	      (*cmat).A[col*LDA+2*bandw+row-col]=
		(*K).A[j];
	    }
	  /* build cmat.nodes */
	  (*cmat).nodes[i]=iperm[i];
	}
      /* replace the dirichlet rows by unity, (not symmetric ==> no
	 need to replace the collumns) */
      for (i=0; i<nr_diri; i++)
	{
	  FIDX maxi, mini;
	  
	  row=perm[diris[i]];
	  
	  maxi=row+bandw;
	  mini=row-bandw;
	  if (maxi>=dofs) maxi=dofs-1;
	  if (mini<0) mini=0;
	  for (col=mini; col<=maxi; col++)
	    {
	      (*cmat).A[col*LDA+2*bandw+row-col]=0.0;
	    }
	  
	  col=row;
	  (*cmat).A[col*LDA+2*bandw+row-col]=1.0;
	}
      fdofs =  dofs;
      fLDA  =  LDA;
      i_bandw= (int) bandw;
      dgbtrf_( &fdofs, &fdofs, &i_bandw, &i_bandw, (*cmat).A, &fLDA,
	       (*cmat).ipiv, &err); 
      if (err!=0)
	{
	  fprintf(stderr, 
		  "coarse_mat_set: LAPACK routine dgbtrf returned "
		  "with info=%d\n",
		  err);
	  return FAIL;
	}
    } /* end symm==2 */
  else
    {
      fprintf(stderr, "coarse_mat_set: unknown symm=%d\n",
	      symm);
      return FAIL;
    }

  /* free local data */
  free(perm);
  free(iperm);

  /* end of #ifdef HAVE_UMFPACK, #else */
#endif
  /* end of #ifdef HAVE_HYPRE_NO_MPI, #else */
  /*#endif*/

  return SUCCESS;
}


#ifdef HAVE_HYPRE_NO_MPI
/*FUNCTION*/
int sparse_solve_HypreAMG_cmat_set( struct sparse *K, 
	FIDX nr_diri, FIDX *diris,
	int symm, struct coarse_mat *cmat
/* initialises hypreAMG (Algebraic Multi Grid) for the stiffness matrix K
   and stores its data in cmat, such that this is suitable for the coarse
   grid solver, HypreAMG versions of coarse_mat_set
   
   Input:  K       - the stiffness matrix for the coarse grid
           nr_diri - number of degrees of freedom (DOFs) which are
	             subject to Dirichlet boundary conditions, such
	             that the according rows and columns of the
	             matrix have to be deleted 
           diris   - vector of length nr_diri, stating which DOFs
                     these are
	   symm    - marking the symmetry properties of the matrix,
	             if the matrix is known to be symmetric positive
	             definit, use symm=1 (so more efficient methods
	             can be used), 
		     if the matrix is known to have symmetric
		     connectivity but the matrix is not necessarily
		     symmetric, use symm=2 

   Output: cmat    - (given by reference) on output it will contain
                     the data for the coarse grid solver, internal
                     fields are initialised and memory allocated

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
	){
   int err;
   int symm_AMG;
   switch (symm)
     {
     case 1:
       symm_AMG=1;
       break;
     case 2:
       symm_AMG=0;
       break;
     default:
       fprintf(stderr,"sparse_solve_HypreAMG_cmat_set: "
	       "wrong symm parameter\n");
       return FAIL;
     }

   err=gen_hypre_AMG_setup(K, nr_diri, diris, 1, 
			   &(*cmat).AMGdata, symm_AMG );
   FUNCTION_FAILURE_HANDLE( err, gen_hypre_AMG_setup,
			    sparse_solve_HypreAMG_cmat_set);

   (*cmat).HYPREamgIsInit = 1;

   return SUCCESS;
}
#endif /* HAVE_HYPRE_NO_MPI */

#ifdef HAVE_UMFPACK
/*FUNCTION*/
int sparse_solve_UMFPACK_cmat_set( struct sparse *K, 
	FIDX nr_diri, FIDX *diris,
	int symm, struct coarse_mat *cmat
/* computes the triangular factorisation of the stiffness matrix K
   and stores it in cmat, such that this is suitable for the coarse
   grid solver, UMFPACK versions of coarse_mat_set
   
   Input:  K       - the stiffness matrix for the coarse grid
           nr_diri - number of degrees of freedom (DOFs) which are
	             subject to Dirichlet boundary conditions, such
	             that the according rows and columns of the
	             matrix have to be deleted 
           diris   - vector of length nr_diri, stating which DOFs
                     these are
	   symm    - marking the symmetry properties of the matrix,
	             if the matrix is known to be symmetric positive
	             definit, use symm=1 (so more efficient methods
	             can be used), 
		     if the matrix is known to have symmetric
		     connectivity but the matrix is not necessarily
		     symmetric, use symm=2 

   Output: cmat    - (given by reference) on output it will contain
                     the data for the coarse grid solver, internal
                     fields are initialised and memory allocated

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
	){
   int err;
   FIDX i, len;
   struct vector tmpX,tmpRHS;

   len=(*K).row_nr;

   /* copy the diris to the projector1 */ 
   err=projector1_alloc(&(*cmat).P, nr_diri);
   FUNCTION_FAILURE_HANDLE( err, projector1_alloc,
	sparse_solve_UMFPACK_cmat_set);
   for (i=0; i<nr_diri; i++)
     {
      (*cmat).P.V[i]=diris[i];
    }
   (*cmat).P.len=nr_diri;

   /* allocate some vectors, fill them with rubbish */
   err=vector_alloc(&tmpX, len);
   FUNCTION_FAILURE_HANDLE( err, vector_alloc,
	sparse_solve_UMFPACK_cmat_set);
   err=vector_alloc(&tmpRHS, len);
   FUNCTION_FAILURE_HANDLE( err, vector_alloc,
	sparse_solve_UMFPACK_cmat_set);
   for (i=0; i<len; i++)
     {
      tmpX.V[i]   = 1.0;
      tmpRHS.V[i] = 2.0;
    }

   err=sparse_solve_UMFPACK_reuse(&tmpX, K, &tmpRHS, &(*cmat).P, 
	&(*cmat).UMFPACKdata, &(*cmat).UMFPACKisdiri,
	&(*cmat).UMFPACKhelp, 1);
   FUNCTION_FAILURE_HANDLE( err, sparse_solve_UMFPACK_reuse,
	sparse_solve_UMFPACK_cmat_set);

   vector_free(&tmpRHS);
   vector_free(&tmpX);

   (*cmat).UMFPACKisinit = 1;

   return SUCCESS;
}
#endif /* HAVE_UMFPACK */

/*FUNCTION*/
int coarse_mat_solve( struct coarse_mat *cmat, enum transposetype trans,
		      struct vector *rhs, struct vector *x
/* solves the equation system 

     A*x = rhs

   where a triangulation of A has been stored in cmat by the routine
   coarse_mat_set
   
   Input:  cmat    - (given by reference) on output it will contain
                     the data for the coarse grid solver, internal
                     fields are initialised and memory allocated
           trans   - ==NoTrans  => normal solve
                     ==Trans    => solve with transposed of the matrix
           rhs     - right hand side of the system, can be the same as
                     rhs, in which case rhs is overwriten with the
                     solution on exit

   Output: x       - solution of the system, can be the same as rhs,
                     in which case rhs is overwriten with the solution
                     on exit

   Return: SUCCESS - success,
           FAIL    - failure, see error message
*/
		      ){
  FIDX i, dof, bandw;
  int LDA, i_bandw, symm, nr, one, info;
  char uplo='U';

  if ( (*cmat).HYPREamgIsInit == 1)
    {
      /* use HypreAMG version */
#ifdef HAVE_HYPRE_NO_MPI
      int err;
      err=gen_hypre_AMG_preco(NULL, rhs, (*cmat).AMGdata, x);
      FUNCTION_FAILURE_HANDLE( err, gen_hypre_AMG_preco,
			       coarse_mat_solve);
      return SUCCESS;
#else
      fprintf(stderr,"coarse_mat_solve: HypreAMGinit error\n");
      return FAIL;
#endif
    } /* end use HypreAMG */



  if ( (*cmat).UMFPACKisinit == 1)
    {
      /* use UMFPACK version */
#ifdef HAVE_UMFPACK
      int err;
      for (i=0; i<(*cmat).P.len; i++)
	{
          (*x).V[(*cmat).P.V[i]]=0.0;
        }
      err=sparse_solve_UMFPACK_reuse(x, NULL, rhs, &(*cmat).P, 
				     &(*cmat).UMFPACKdata,
				     &(*cmat).UMFPACKisdiri,
				     &(*cmat).UMFPACKhelp, 2);
      FUNCTION_FAILURE_HANDLE( err, sparse_solve_UMFPACK_reuse,
			       coarse_mat_solve);

      return SUCCESS;
#else
      fprintf(stderr,"coarse_mat_solve: UMFPACKinit error\n");
      return FAIL;
#endif
    } /* end use UMFPACK */

  nr    = (*cmat).nr;
  bandw = (*cmat).band_w;
  symm  = (*cmat).symm;
  one=1;

  if (((*rhs).len!=nr)||((*x).len!=nr))
    {
      fprintf(stderr,
	      "coarse_mat_solve: sizes of cmat, rhs, x don't match!\n");
      return FAIL;
    }

  /* build the rhs for the LAPACK solver */
  for (i=0; i<nr; i++)
    {
      dof=(*cmat).nodes[i];
      (*cmat).help[i]=(*rhs).V[dof];
    }

  switch(symm)
    {
    case 1:
      LDA=(int) bandw+1;
      i_bandw=(int) bandw;
      /* call LAPACK to do the solve */
      dpbtrs_(&uplo, &nr, &i_bandw, &one, (*cmat).A, &LDA, (*cmat).help,
	      &nr, &info);
      if (info!=0)
	{
	  fprintf(stderr, "coarse_mat_solve: LAPACK "
		  "routine dpbtrs returned with info=%d\n",
		  info);
	  return FAIL;
	}
      break;
    case 2:
      LDA=(int) 3*bandw+1;
      i_bandw=(int) bandw;
      
      if (trans==NoTrans) uplo='N';
      else uplo='T';

      /* call LAPACK to do the solve */
      dgbtrs_(&uplo, &nr, &i_bandw, &i_bandw, &one, (*cmat).A, &LDA,
	      (*cmat).ipiv, (*cmat).help, &nr, &info);
      if (info!=0)
	{
	  fprintf(stderr, "coarse_mat_solve: LAPACK "
		  "routine dgbtrs returned with info=%d\n",
		  info);
	  return FAIL;
	}
      break;
    default:
      fprintf(stderr, "coarse_mat_solve: unknown symm=%d\n",
	      symm);
      return FAIL;
    }
  /* write the solution back to x */
  for (i=0; i<nr; i++)
    {
      dof=(*cmat).nodes[i];
      (*x).V[dof]=(*cmat).help[i];
    }
   
  return SUCCESS;
}  

/*FUNCTION*/
int int_list_alloc( struct int_list *L, FIDX ixnmax, FIDX elnmax
/* allocates memory in the int_list struct for given size
   
   Input:  ixnmax  - maximal number of parts of the list,
           elnmax  - maximal number of elements in the list,

   Output: L       - the list struct, memory is allocated and
                     dimensions initialised

   Return: SUCCESS - success,
           FAIL    - failure, see error message, output will be
                     invalid,
*/
		    ){
  TRY_MALLOC( L->ix, ixnmax+1, FIDX, int_list_alloc);
  TRY_MALLOC( L->el, elnmax, FIDX, int_list_alloc);

  L->ixn=0;
  L->ixnmax=ixnmax;
  L->elnmax=elnmax;

  return SUCCESS;
}

/*FUNCTION*/
void int_list_free( struct int_list *L
/* frees the memory in the int_list struct, so it can be freed
   
   In/Out: L       - the list struct, memory is freed
*/
		    ){
  free(L->ix);
  free(L->el);

  L->ixn=0;
  L->ixnmax=0;
  L->elnmax=0;
}

/*FUNCTION*/
int bandw_red_perm( FIDX nr, FIDX *adix, FIDX *adel,
		    FIDX *perm, FIDX *iperm,
		    FIDX *bandw
/* calls the Gibbs-Poole-Stockmeyer modification of the
   Cuthill-McKee algorithm to produce a bandwith reducing permutaion
   for a given adjency list of a sparce matrix
   
   Input:  nr      - nr of verticex in the graph
           adix    - index of the adjency list
           adel    - element vector of the adjency list (needs to be
                     full adjency list, i.e. contain all edges (i,j)
                     twice: as (i,j) and (j,i) )

   Output: perm    - bandwith reducing permutation, given as integer 
                     vector, (size=nr)
	   iperm   - inverse permutation of perm, given as integer 
                     vector, (size=nr)
           bandw   - resulting bandwidth

   Return: SUCCESS - success,
           FAIL    - failure, see error message, output will be
                     invalid,
*/
		    ){
  int  err;
  FIDX i, j;
  FIDX *ihelp, *npart, tc, nc;
  struct int_list parts, pcon;

  /* quick return? */
  if (nr==0) return SUCCESS;

  /* check if the graph is connected, if not find the connected parts
  */ 
  TRY_MALLOC(ihelp, nr, FIDX, bandw_red_perm);
  for (i=0; i<nr; i++) ihelp[i]=0;
  nc=0;
  for (i=0; i<nr; i++) 
    {
      /* check if this vertex starts a new part */
      if (ihelp[i]==0)
	{
	  /* start a new part, mark all connected nodes recursively */
	  nc++;
	  err=bandw_mark_connected(nc, i, nr, adix, adel, 0, ihelp);
	  FUNCTION_FAILURE_HANDLE(err, bandw_mark_connected,
				  bandw_red_perm);
	}
    }

  /* printf("  test:  nc=%3d, nr=%3d\n", nc, nr);/* */

  err=int_list_alloc(&parts, nc, nr);
  FUNCTION_FAILURE_HANDLE(err, int_list_alloc, bandw_red_perm);
  /* get the number of vertices in each conected subgraph */
  for (i=0; i<nc+1; i++) parts.ix[i]=0;
  for (i=0; i<nr; i++) parts.ix[ihelp[i]]++;
  /* build a index list from that */
  for (i=0; i<nc; i++) parts.ix[i+1]+=parts.ix[i];
  /* build the element list */
  TRY_MALLOC( npart, nc, FIDX, bandw_red_perm);
  for (i=0; i<nc; i++) npart[i]=0;
  for (i=0; i<nr; i++)
    {
      tc=ihelp[i]-1;  /* part numbering starts at 1, index numb. at 0 */
      parts.el[parts.ix[tc]+npart[tc]]=i;
      npart[tc]++;
    }
  free(npart);

#ifdef DEBUGFEINS
  if (nr!=parts.ix[nc])
    {
      fprintf(stderr,"bandw_red_perm: nr(whole)!=sum(nr(parts))\n");
      return FAIL;
    }
#endif


  /* now build a full connectivity list as labeled by the permutation
  */
  err=int_list_alloc(&pcon, nr, 2*adix[nr]);
  FUNCTION_FAILURE_HANDLE(err, int_list_alloc, bandw_red_perm);
  
  /* get the degree of each vertex */
  pcon.ix[0]=0;
  for (i=0; i<nr; i++) pcon.ix[parts.el[i]+1]=adix[i+1]-adix[i];


    /* old for half adj-list: 
       for (i=0; i<nr+1; i++) pcon.ix[i]=0;
       for (i=0; i<nr; i++) 
       for (j=adix[i]; j<adix[i+1]; j++)
       {
       /* the vertex itself counts the degree 
       pcon.ix[parts.el[i]+1]++; 
       /* and the adjacent vertex counts the degree 
       pcon.ix[parts.el[adel[j]]+1]++;
       } */
  /* build a index list from that */
  for (i=0; i<nr; i++) pcon.ix[i+1]+=pcon.ix[i];

#ifdef DEBUGFEINS
  /* if (pcon.ix[nr]!=2*adix[nr]) /* old for half adj-list */
  if (pcon.ix[nr]!=adix[nr])
    {
      fprintf(stderr,"bandw_red_perm: sum(degree) mismatch\n");
      return FAIL;
    }
#endif
 
  /* build the element list the fill counter goes into ihelp */
  for (i=0; i<nr; i++) ihelp[i]=0;
  for (i=0; i<nr; i++)
    for (j=adix[i]; j<adix[i+1]; j++)
      {
	FIDX a,b;
	/* the two vertices of the edge */
	a=parts.el[i]; 
	b=parts.el[adel[j]];

	/* printf("i=%3d  j=%3d   a=%3d  b=%3d   ", i, j, a, b);
	   printf("pcon.ix[a]=%3d   ihelp[a]=%3d   pcon.ix[b]=%3d   "
	   "ihelp[b]=%3d\n", 
	   pcon.ix[a], ihelp[a], pcon.ix[b], ihelp[b]); */
	/* add the two copies of the edge to the full list */
	pcon.el[pcon.ix[a]+ihelp[a]]=b;
	ihelp[a]++;
	/* old for half adj-list: 
	   pcon.el[pcon.ix[b]+ihelp[b]]=a;
	   ihelp[b]++; */
      }

  /* now call the bandwidth reduction for each of the subgraphs, store
     the permutations of the subgraphs in ihelp, get the max of the
     subgraph bandwidths */
  *bandw=0;
  for (i=0; i<nc; i++)
    {
      FIDX sbandw;
      err=bandw_gipost(parts.ix[i], parts.ix[i+1], &pcon,
		       ihelp, iperm, &sbandw);
      FUNCTION_FAILURE_HANDLE(err, bandw_gipost, bandw_red_perm);

      /* overall bandwidth is max of subgraph bandwidths */
      if (sbandw>*bandw) *bandw= sbandw;
    }

  /* the element list parts.el itself forms a permutation, which
     collects the individual subgraphs one after each other, 
     the overall bandwith reducing perutation is the composition of
     parts.el and the permutation now in ihelp */
  for (i=0; i<nr; i++) perm[i]=ihelp[parts.el[i]];

  /* the inverse permutation */
  for (i=0; i<nr; i++) iperm[perm[i]]=i;

  /* done, free local memory and return */
  free(ihelp);
  int_list_free(&parts);
  int_list_free(&pcon);

  return SUCCESS;
}

/*FUNCTION*/
int bandw_gipost( FIDX n1, FIDX n2, struct int_list *ad,
		  FIDX *perm, FIDX *iperm, FIDX *bandw
/* applies the Gibbs-Poole-Stockmeyer modification of the
   Cuthill-McKee algorithm to produce a bandwith reducing permutaion
   for a given full adjency list of a connected subgraph of the
   connectivity graph of a sparce matrix
   
   Input:  n1      - starting position of the connected subgraph
           n2      - end position(+1) of the connected subgraph
           ad      - full adjency list of the whole graph, only the n1
                     to n2-1 parts are read

   Output: perm    - bandwith reducing permutation, the n1 to n2-1
                     entries are written, given as integer vector
           iperm   - reverse of the bandwith reducing permutation,
	             the n1 to n2-1 entries are written, given as
	             integer vector 
           bandw   - resulting bandwidth for this subgraph

   Return: SUCCESS - success,
           FAIL    - failure, see error message, output will be
                     invalid,
*/
		  ){
  int  err;
  FIDX i, j, nc, tmp, length, last, u, v, tv, current;
  FIDX width, widthv, widthu;
  FIDX from, to , dir, lvl;

  FIDX nLi, nLimax; /* Li is going to hold a set of rooted level structures,
		      nLi will mark the number of defined ones in it,
		      nLimax the number of level structs for which
		      memory is allocated in Li
		      Lu and Lv will eventually be the level
		      structures for the endpoints u and v of a pseudo
		      diameter of the graph,
		      LN will be the final level structure which is
		      then used to define the numbering */
  struct int_list **Li, *Lu, *Lv;

  /* integer help vectors */
  FIDX *hint, *hintv, *hintu, *sorter, *Nix, *Lix, *Hix;


  /* test the trivial case, because it breaks stuff */
  if (n2-n1==1)
    {
      /* only one node in this graph, so there is only one permutation
      */

      perm[n1]=n1;
      iperm[n1]=n1;
      (*bandw)=1;

      return SUCCESS;
    }

  /* allocate memory */
  TRY_MALLOC( hint, n2-n1, FIDX, bandw_gipost);
  TRY_MALLOC( hintv, n2-n1, FIDX, bandw_gipost);
  TRY_MALLOC( hintu, n2-n1, FIDX, bandw_gipost);
  nLimax=30;
  TRY_MALLOC( Li, nLimax, struct int_list*, bandw_gipost);
  nLi=0;

  TRY_MALLOC( Lv, 1, struct int_list, bandw_gipost);

  /*************************************************************/
  /*                                                           */
  /* Stage 1: find a pseudo diameter to give starting vertices */
  /*                                                           */
  /*************************************************************/

  /* find a vertex v with min degree */
  tmp=n2-n1; /* init mindegree=max possible degree */
  for (i=n1; i<n2; i++)
    if (ad->ix[i+1]-ad->ix[i]<tmp)
      {
	tmp=ad->ix[i+1]-ad->ix[i];
	v=i;
      }

/*   printf(" test: min degree=%"dFIDX"\n", tmp); */

  err=bandw_level_struct( v, n1, n2, ad, Lv, hint);
  FUNCTION_FAILURE_HANDLE(err, bandw_level_struct, bandw_gipost);

  last=0;
  while (last==0)
    {
      /* number of possible other endpoints of the diameter = number
	 vertices in the highest level of the level struct */
      nLi = Lv->ix[Lv->ixn+1] - Lv->ix[Lv->ixn];
      if (nLi>nLimax)
	{
	  nLimax=nLi;
	  TRY_REALLOC( Li, nLimax, struct int_list*, bandw_gipost);
	}



      /* length of the pseudo diameter */
      length= Lv->ixn;

      /* sort the other possible endpoints by ascending degree */
      TRY_MALLOC(sorter, 2*nLi, FIDX, bandw_gipost);
      for (i=0; i< Lv->ix[Lv->ixn+1] - Lv->ix[Lv->ixn]; i++)
	{
	  FIDX vcandi=Lv->el[ i + Lv->ix[Lv->ixn]];
	  /* in sorter: first degree, then number of the vertex */
	  sorter[2*i+0]=ad->ix[vcandi+1]-ad->ix[vcandi];
	  sorter[2*i+1]=vcandi;
	}
      qsort( sorter, nLi, 2*sizeof(FIDX), comp_fidx);
      
      /* pretent this is the last try */
      last=1;
      /* for each of these build the level structure */
      for(i=0; (i<nLi)&&(last==1); i++) 
	{
	  FIDX vcandi=sorter[2*i+1];
	  TRY_MALLOC( Li[i], 1, struct int_list, bandw_gipost);
	  /* printf(" test:  sorter[%2d]=  [ %3d, %3d ],   ", i, sorter[2*i+0],
	     sorter[2*i+1]); /* */
	  err=bandw_level_struct( vcandi, n1, n2, ad, Li[i], hint);
	  FUNCTION_FAILURE_HANDLE(err, bandw_level_struct, bandw_gipost);

	  /* if the current endpoint gives a larger diameter
	     candidate, reject u, set u=this candidate, start again */
	  if (Li[i]->ixn > length)
	    {
	      int_list_free(Lv);
	      free(Lv);
	      v=vcandi;
	      Lv=Li[i];
	      /* free all prior attempts */
	      for (j=0; j<i; j++)
		{
		  int_list_free(Li[j]);
		  free(Li[j]);
		}
	      free(sorter);
	      /* next try */
	      last=0;
	    }
	}
    }
  /* now v and Lv are fixed, define u to be the root of the Li with
     the smallest width */
  widthu=n2-n1; /* init min(width)=max possible width */
  for (i=0; i<nLi; i++)
    {
      width=0;
      if (Li[i]->ixn == length)
	{
	  for (j=0; j<=length; j++)
	    {
	      if (Li[i]->ix[j+1]-Li[i]->ix[j]>width)
		{
		  width=Li[i]->ix[j+1]-Li[i]->ix[j];
		}
	    }
	  if (width<widthu)
	    {
	      widthu=width;
	      u=i;
	    }
	}
    }
  /* printf("  test: minwidth=%"dFIDX"  for  Li[%"dFIDX"]\n", widthu, u); /* */
  /* by now u is the number of the Li width the smallest width, so
     define Lu and u correctly */
  Lu=Li[u];
  Li[u]=NULL;
  u=sorter[2*u+1];
  /* now forget the remaining Li */
  for (i=0; i<nLi; i++)
    {
      if (Li[i]!=NULL)
	{
	  int_list_free(Li[i]);
	  free(Li[i]);
	}
    }
  free(Li);
  nLi=0;
  nLimax=0;
  free(sorter);

  /* calculate width of v */
  widthv=n2-n1; /* init min(width)=max possible width */
  {
    width=0;
    if (Lv->ixn == length)
      {
	for (j=0; j<=length; j++)
	  {
	    if (Lv->ix[j+1]-Lv->ix[j]>width)
	      {
		width=Lv->ix[j+1]-Lv->ix[j];
	      }
	  }
	if (width<widthv)
	  {
	    widthv=width;
	  }
      }
  }


  /*************************************************************/
  /*                                                           */
  /* Stage 2: minimizing the level width                       */
  /*                                                           */
  /*************************************************************/

  /* printf("\n Stage 2: test \n\n"); */

  /* rebuild Lu and Lv since we need their hintu and hintv */
  int_list_free(Lu);
  err=bandw_level_struct( u, n1, n2, ad, Lu, hintu);
  FUNCTION_FAILURE_HANDLE(err, bandw_level_struct, bandw_gipost);
  int_list_free(Lv);
  err=bandw_level_struct( v, n1, n2, ad, Lv, hintv);
  FUNCTION_FAILURE_HANDLE(err, bandw_level_struct, bandw_gipost);
  /* hintv and hint now hold the pairs of levels for each vertex */

  /* build N, count the vertices whoes level is clear by the pair of
     levels */
  TRY_MALLOC(Nix, length+1, FIDX, bandw_gipost);
  TRY_MALLOC(Hix, length+1, FIDX, bandw_gipost);
  TRY_MALLOC(Lix, length+1, FIDX, bandw_gipost);
  for(i=0; i<=length; i++) Nix[i]=0;
  for (i=0; i<n2-n1; i++)
    {
      if (hintv[i]==length-hintu[i])
	{
	  /* printf("  test:   vertex[%3d]: lvl=%"dFIDX"\n", i, hintv[i]);
	  /* take it out of the game and count it in this level */
	  hint[i]=-1;
	  Nix[hintv[i]]++;
	}
      else
	{
	  /* needs to be assigned to an connected subgraph */
	  hint[i]=0;
	}
    }
  
  /* partition the undecided vertices into connected subgraphs */
  nc=0;
  for (i=0; i<n2-n1; i++) 
    {
      /* check if this vertex starts a new part */
      if (hint[i]==0)
	{
	  /* start a new part, mark all connected nodes recursively */
	  nc++;
	  err=bandw_mark_connected(nc, i+n1, n2, ad->ix, ad->el, n1, hint);
	  FUNCTION_FAILURE_HANDLE(err, bandw_mark_connected,
				  bandw_gipost);
	}
    }
/*   printf("  test: %"dFIDX" connected subgraphs need to be assigned to a level\n", */
/* 	 nc); */
  
  TRY_MALLOC(sorter , 2*nc, FIDX, bandw_gipost);
  for (i=0; i<nc; i++) 
    {
      sorter[2*i  ]=0; /* #vertices in this subgraph */
      sorter[2*i+1]=i;
    }
  for (i=0; i<n2-n1; i++) 
    if (hint[i]>0) sorter[(hint[i]-1)*2]++;
  qsort( sorter, nc, 2*sizeof(FIDX), comp_fidx);
  /* now subgraphs sorted in ascending order of #vertices, start with
     largest subgraph to decide which of the two alternative level
     sets to use */

  /* for (i=0; i<n2-n1; i++) {
     printf(" test:   vertex[%3d]: subgraph=%3d lvl_v=%3d   lvl_u=%3d\n",
     i, hint[i], hintv[i], hintu[i]);
     } /* */

  for (j=nc-1; j>=0; j--)
    {
      FIDX Hixmax, Lixmax, choose;
      /* count the vertex numbers per level which would result from
	 putting the undecided in the level as per v resp. per u */

/*       printf(" test: subgraph[%2d]= %3d    #vx=%3d\n", j, sorter[2*j+1], */
/* 	     sorter[2*j]); */
      for(i=0; i<=length; i++) { Hix[i]=0; Lix[i]=0; }
      for(i=0; i<n2-n1; i++) 
	{
	  /* if in this subgraph */
	  if (hint[i]-1==sorter[2*j+1])
	    {
	      Hix[hintv[i]]++;
	      Lix[length-hintu[i]]++;
	    }
	}
      /*get the maximum of each Lix and Hix (with Nix added to each)*/
      Hixmax=0; Lixmax=0;
      for(i=0; i<=length; i++)
	{
	  if (Lix[i]+Nix[i]>Lixmax) Lixmax=Lix[i]+Nix[i];
	  if (Hix[i]+Nix[i]>Hixmax) Hixmax=Hix[i]+Nix[i];
/* 	  printf(" test: Nix[%3d]=%3d   Lix[]=%3d     Hix[]=%3d\n", */
/* 		 i, Nix[i], Lix[i], Hix[i]); */
	}
      if (Hixmax<Lixmax)
	{
	  choose=1;
	}
      else if (Hixmax>Lixmax)
	{
	  choose=2;
	}
      else
	{
	  /* equally good, decide by the width of Lv and Lu */
	  if (widthv<=widthu) choose=1;
	  else choose=2;
	}
/*       printf(" test:  choose=%"dFIDX"  Hixmax=%"dFIDX", Lixmax=%"dFIDX"    " */
/* 	     "widthv=%"dFIDX", widthu=%"dFIDX"\n", */
/* 	     choose, Hixmax, Lixmax, widthv, widthu); */
      /* now remember what we have decided by modifying hintv and
	 hintu */
      for(i=0; i<n2-n1; i++) 
	{
	  /* if in this subgraph */
	  if (hint[i]-1==sorter[2*j+1])
	    {
	      if (choose==1) hintu[i]=length-hintv[i];
	      else hintv[i]=length-hintu[i];
	    }
	}
      /* update Nix */
      for(i=0; i<=length; i++)
	{
	  if (choose==1) Nix[i]+=Hix[i];
	  else Nix[i]+=Lix[i];
	}
    }
  /* all these guys have now been assigned a level, as can be found in
     hintv[i], the number of vertices in each level can be found in,
     now adjust Lv to give this level structure */
  Lv->ix[0]=0;
  for(i=0; i<=length; i++)
    {
      Lv->ix[i+1]= Lv->ix[i]+Nix[i];
      Nix[i]=0;
    }
  for(i=0; i<n2-n1; i++) 
    {
      FIDX lvl=hintv[i];
      /* printf(" test: lvl[%3d]=%3d   Lv->ix=%3d Nix[lvl]=%3d\n",
	 i, lvl, Lv->ix[lvl], Nix[lvl]); /* */
      Lv->el[Lv->ix[lvl] + Nix[lvl]]=i+n1;
      Nix[lvl]++;
    }

  /* sorter, Lu, Nix, Hix, Lix not needed any more */
  int_list_free(Lu);
  free(Lu);
  free(Nix);
  free(Hix);
  free(Lix);
  free(sorter);
  free(hintu);
  free(hint);


  /*************************************************************/
  /*                                                           */
  /* Stage 3: numbering                                        */
  /*                                                           */
  /*************************************************************/

  /* check the degree of v and u to decide at which end of the
     levelstructure to start */
  if (ad->ix[v+1]-ad->ix[v] <= ad->ix[u+1]-ad->ix[u])
    {
      /* v lower degree, so go upwards in the v levels */
      from=0;
      to=length;
      dir=1;
      tv=v;
    }
  else
    {
      /* v higher degree, so downwards in the v levels */
      from=length;
      to=0;
      dir=-1;
      tv=u;
    }
  /* init the permutation (as undefined) */
  for (i=0; i<n2-n1; i++) 
    { 
      perm[i+n1]  = -1;
      iperm[i+n1] = -1;
    }
  /* allocate workspace for sorting */
  width=widthu;
  if (widthv>width) width=widthv;
  TRY_MALLOC( sorter, 2*width, FIDX, bandw_gipost);

  current=n1;
  /* start with tv, assign all elements of level from with consecutive
     numbers */
  perm[tv]=current;
  iperm[current]=tv;
  current++;
  err=bandw_number_adj_vert( ad, Lv, hintv, from, from, n1, n2,
			     perm, iperm, &current, sorter);
  FUNCTION_FAILURE_HANDLE( err, bandw_number_adj_vert, bandw_gipost);

  /* for all levels: assign the adjecent vertices of the lower level
     first with numbers, then the remaining of the higher level */
/*   printf(" test:  start numbering from=%"dFIDX"  to=%"dFIDX"  dir=%+d   " */
/* 	 "n1=%"dFIDX"  current=%"dFIDX"\n", from, to, dir, n1, current); */
  for (lvl=from+dir; dir*lvl<=dir*to; lvl+=dir)
    {
      /* lower -> higher */
      err=bandw_number_adj_vert( ad, Lv, hintv, lvl-dir, lvl, n1, n2,
				 perm, iperm, &current, sorter);
      FUNCTION_FAILURE_HANDLE( err, bandw_number_adj_vert, bandw_gipost);
      /* higher -> higher */
      err=bandw_number_adj_vert( ad, Lv, hintv, lvl, lvl, n1, n2,
				 perm, iperm, &current, sorter);
      FUNCTION_FAILURE_HANDLE( err, bandw_number_adj_vert, bandw_gipost);
/*       printf(" test:     lvl=%"dFIDX"   current=%"dFIDX"\n", */
/* 	     lvl, current); */
    }



/*   printf(" test: gipost done,  n2=%"dFIDX"   ?=  current=%"dFIDX"\n", n2, current); */

#ifdef DEBUGFEINS
  if (current!=n2)
    {
      fprintf(stderr,"bandw_gipost: failed to assign all vertices \n");
      for (i=n1; i<n2; i++)
	printf(" vertex %3"dFIDX"  assigned  %3"dFIDX", level=%3"dFIDX"\n",
		   i,  perm[i],  hintv[i]);
      /*if (perm[i]==-1)
	fprintf(stderr," vertex %"dFIDX"  not assigned, level=%"dFIDX"\n",
	i, hintv[i]);*/
      return FAIL;
    }
#endif
  

  /* clean up */
  free(sorter);
  int_list_free(Lv);
  free(Lv);
  free(hintv);

  /* compute the bandwidth */
  *bandw=0;
  for (i=n1; i<n2; i++)
    for (j=ad->ix[i]; j<ad->ix[i+1]; j++)
      {
	if (abs(perm[i]-perm[ad->el[j]]) > *bandw)
	  *bandw=abs(perm[i]-perm[ad->el[j]]);
      }

  return SUCCESS;
}

/*FUNCTION*/
int comp_fidx(const void *a1, const void *b1
/* helper function for qsort, compares the integers a and b

   Input:  a1      - pointer to int a
           b1      - pointer to int b

   Return: -1   if a >  b
            0   if a == b
            1   if a <  b
*/
		     ){
  FIDX *a, *b;

  a=(FIDX *) a1;
  b=(FIDX *) b1;

  return *a-*b;
}


/*FUNCTION*/
int bandw_level_struct( FIDX root, FIDX n1, FIDX n2, struct int_list *ad, 
			struct int_list *L, FIDX *hint
/* defines the level structure rooted at root, for a given subgraph

   Input:  root    - root of the level structure
           n1      - index of the first node of the connected subgraph
           n2      - index of the last node +1 of the connected subgraph
           ad      - full adjency list of the whole graph

   In/Out: hint    - integer help vector, lenght n2-n1, has to be
                     provided by the calling routine

   Output: L       - level structure rooted at root, only empty
                     int_list struct has to be given, it is
                     initialised and memory in it allocated

   Return: SUCCESS - success,
           FAIL    - failure, see error message, output will be
                     invalid,
*/
			){
  int  err;
  FIDX i, j, nr, found;


  nr=n2-n1;

  err=int_list_alloc(L, nr, nr);
  FUNCTION_FAILURE_HANDLE(err, int_list_alloc, bandw_level_struct);

  for (i=0; i<nr; i++) hint[i]=-1;

  L->ix[0]=0;
  L->el[0]=root;
  hint[root-n1]=0;
  L->ixn=1;
  L->ix[1]=1;
  if (nr==1)
    {
      fprintf(stderr, "bandw_level_struct: "
	      "called for subgraph of only one node\n");
      return FAIL;
    }


  found=1; /* to start the search for more nodes */
  /* as long as the last level produced vertices in the new level, create
     yet a new level */
  while(found==1)
    {
      FIDX lvl;
      found=0;
      lvl=L->ixn;
      L->ix[lvl+1]=L->ix[lvl];
      /* for all nodes of the last level */
      for (i=L->ix[lvl -1]; i<L->ix[lvl]; i++)
	{
	  FIDX vx, avx;
	  /* mark all adjecent vertices which are assined to a level
	     as belonging to the new level */
	  vx=L->el[i];
	  for (j=ad->ix[vx]; j<ad->ix[vx+1]; j++)
	    {
	      avx=ad->el[j];
	      if (hint[avx-n1]==-1)
		{
		  hint[avx-n1]=lvl;
		  L->el[L->ix[lvl+1]]=avx;
		  L->ix[lvl+1]++;
		  found=1;
		}
	    }
	}
      L->ixn++;
    }
  /* correct the last increment, since nothing was found  */
  L->ixn--;

  /* if the last level is empty, reduce the number of levels again */
  if (L->ix[L->ixn]==L->ix[L->ixn+1]) L->ixn--;

  /* printf("  test: levels=%3"dFIDX", el in last level=%3"dFIDX"\n", L->ixn,
     L->ix[L->ixn+1]-L->ix[L->ixn]); /* */

#ifdef DEBUGFEINS
  for (i=0; i<nr; i++)
    if (hint[i]==-1)
      {
	fprintf(stderr,"bandw_level_struct: node with no level found\n"
		"node=%"dFIDX", n1=%"dFIDX", n2=%"dFIDX", root=%"dFIDX" \n",
		 (i+n1),  n1,  n2,  root);
	
	return FAIL;
      }
#endif
  

  return SUCCESS;
}

/*FUNCTION*/
int bandw_mark_connected( FIDX label, FIDX start, FIDX end, FIDX *adix, 
			  FIDX *adel, FIDX n1, FIDX *marked
/* marks the vertex with label, and all starts a recursion for all
   adjacent vertices i which are marked with marked[i]==0 to mark them
   with the same label, thus all nodes connected to start are marked
   with label, if a vertex i with marked[i]>0 and marked[i]!=label is
   found this is considered an error, thus return will be FAIL,

   Input:  label   - how vertex start and all connected shall be labelled
           start   - the vertex where the labeling starts
           end     - the vertex where the labeling ends
           adix    - vector marking the begin and end of each nodes
                     adjacency list in adel
           adel    - together with adix defines the adjency of the
                     nodes
	   n1      - ofset of the lowest nodenumber in the considered
	             subgraph 
	   

   In/Out: marked  - integer vector, lengh = number of vertices, start
                     and all vertices i connected to start with
                     marked[i]==0, will be marked with makred[i]=label

   Return: SUCCESS - success,
           FAIL    - failure, see error message, output will be
                     invalid,
*/
			){
  FIDX i, j;

  FIDX *rec_start, *rec_j; 
  FIDX rec_depth;

  TRY_MALLOC(rec_start, end-start, FIDX, bandw_mark_connected);
  TRY_MALLOC(rec_j,     end-start, FIDX, bandw_mark_connected);

  rec_depth = 0;

  rec_start[rec_depth] = start;
  rec_j[rec_depth]     = adix[start];


  while (rec_depth>=0)
    {
      start = rec_start[rec_depth];
      marked[start-n1]=label;
        
      j=rec_j[rec_depth];

      /* adjacent vertices left on this level ? */
      if (j<adix[start+1])  
	{
	  /* test adjacent vertex */
	  i=adel[j];
	  if ((marked[i-n1]>0)&&(marked[i-n1]!=label))
	    {
	      fprintf(stderr, "bandw_mark_connected: found vertex allready "
		      "marked[%"dFIDX"]=%"dFIDX"  != %"dFIDX"=label\n",
		       i,  marked[i-n1],  label);
	      return FAIL;
	    }
	  if (marked[i-n1]==0)
	    {
	      /* not yet marked -> go one recursion level deeper,
		 mark all vertices connected to the new start vertex */

	      rec_j[rec_depth]++; /* save (next) position on this
				     recursion level */
	      
	      /* define start at new recursion level */
	      rec_start[rec_depth+1] = i;
	      rec_j[    rec_depth+1] = adix[i];

	      /* go to new recursion level */
	      rec_depth++;
	    }
	  else
	    {
	      /* already marked -> next adjacent vertex */
	      rec_j[rec_depth]++;
	    }
	}
      else
	{
	  /* all adjacent vertices on this level are marked, go down
	     one level */
	  rec_depth--;
	}
    } /* end outer loop of recusion */

  /* clean up */
  free(rec_j);
  free(rec_start);

  return SUCCESS;
}


/*FUNCTION*/
int bandw_number_adj_vert(struct int_list *ad, struct int_list *Lv,
			  FIDX *level, FIDX l1, FIDX l2, FIDX n1, FIDX n2,
			  FIDX *perm, FIDX *iperm, FIDX *current,
			  FIDX *sorter
/* starts with the vertex in level l1 of lowest assigned number,
   assigns all adjecent nodes in level l2 (which have not been
   assigned a number yet) a number in order of increasing degree,
   moves to the next until no further are left

   Input:  ad      - full adjency list
           Lv      - level structure
           level   - integer verctor, specifying the level for each
                     vertex 
           l1      - level in which to start
	   l2      - ofset of the lowest nodenumber in the considered
	             subgraph 
           n1      - numbering offset (begin of the connected subgraph)
           n2      - maximal index of the block (end of the connected
                     subgraph) 
	   

   In/Out: perm    - permutation, which is defined by the assigned
                     numbers integer vector, lengh = number of
                     vertices, every i with perm[i]!=-1 is considered
                     already given a number
           iperm   - inverse permutation to perm, every iperm[i]!=-1
                     is considered already given a number to iperm[i]
           current - current highes assigned number+1 (the next number
                     to assign), updated every time a vertex is
                     assigned a number

   Worksp: sorter  - an integer vector of lenght 2*level_width

   Return: SUCCESS - success,
           FAIL    - failure, see error message, output will be
                     invalid,
*/
			){
  FIDX i, j, sl;

  FIDX lowest_l1, found_new;

  /* find lowest number in level l1 */
  i=*current-1;
  while ((level[iperm[i]-n1]==l1)&&(i>n1)) i--;
  if (level[iperm[i]-n1]!=l1) i++;
  lowest_l1=i;

#ifdef DEBUGFEINS
  if (level[iperm[lowest_l1]-n1]!=l1)
    {
      fprintf(stderr, "bandw_number_adj_vert: something wrong, nothing "
	      "numbered in lower level!\n");
      return FAIL;
    }
#endif

  /* printf(" test: lowest_l1= %3"dFIDX"   l1=%3"dFIDX"   l2=%3"dFIDX"\n", lowest_l1,
     l1, l2); /* */

  /* as long as there apear unnumbered nodes in the level... */
  found_new=1;
  while(found_new!=0)
    {
      found_new=0;
      
      /* check if all adjecent nodes to lowest_l1 are numbered yet: */
      if ((iperm[lowest_l1]>=0)&&(level[iperm[lowest_l1]-n1]==l1))
	{
	  i=iperm[lowest_l1];
	  /* printf(" test: check connected to %"dFIDX":", i); /* */
	  /* set sorter length (sl) to 0 = number of found unnumbered
	     adjecent vertices */
	  sl=0;
	  /* for all adjecent nodes */
	  for (j=ad->ix[i]; j<ad->ix[i+1]; j++)
	    {
	      FIDX adj_vx;
	      adj_vx=ad->el[j];
	  /* printf("   test: adj_vx=%3"dFIDX"    level[adj]=%3"dFIDX"   perm[adj]=%3"dFIDX"\n",
		 adj_vx, level[adj_vx-n1], perm[adj_vx]); /* */
	      /* if in level l2 and not numbered */
	      if ((level[adj_vx-n1]==l2)&&(perm[adj_vx]==-1))
		{
		  /* add to the sorter list */
		  sorter[sl*2+1]=adj_vx;
		  /* sorting is by degree of the vertices */
		  sorter[sl*2  ]=ad->ix[adj_vx+1]-ad->ix[adj_vx];

		  found_new++;
		  sl++;
		}
	    }
	  /* printf("  %3"dFIDX" are connected\n",sl); /* */
	  /* if vertices are found, sort them by degree, number them
	     in increasing order */
	  if (sl>0)
	    {
	      qsort( sorter, sl, 2*sizeof(FIDX), comp_fidx);
	      for (j=0; j<sl; j++)
		{
		  FIDX tv;
		  tv=sorter[j*2+1];
		  /* printf(" test: sorter[%3"dFIDX"]= [ %3"dFIDX", %3"dFIDX" ]   current=%3"dFIDX"\n",
		     j, sorter[j*2], sorter[j*2+1], *current); /* */
		  perm[tv]=*current;
		  iperm[*current]=tv;
		  (*current)++;

#ifdef DEBUGFEINS
		  if (*current>n2)
		    {
		      fprintf(stderr, "bandw_number_adj_vert: "
			      "something wrong, try to number over "
			      "limit: current=%"dFIDX"   n2=%"dFIDX"\n",
			       *current,  n2);
		      return FAIL;
		    }
#else 
		  if (*current==n2) return SUCCESS;
#endif
		}
	    }

	  /* increase the lowest_l1 counter, see if it is still l1 */
	  if (lowest_l1<n2-1)
	    {
	      lowest_l1++;
	      if ((iperm[lowest_l1]>=0)&&(level[iperm[lowest_l1]-n1]==l1))
		{
		  /* still l1 ==> make sure its adjacent vertices are
		     numbered first */
		  if (found_new==0) found_new=1;
		}
	    }
	} /* end numbering of adjacent vertices to numbered ones */

      /* if the levels are identical and no adjacent vertices to
	 numbered ones are found, check if there are unnumbered ones
	 left, if so assign the one of lowest degree with a number */ 
      if ((l1==l2)&&(found_new==0))
	{
	  FIDX tv, min_deg, candidate;
	  min_deg=-1;
	  /* check all vertices in this level */
	  for (i=Lv->ix[l1]; i<Lv->ix[l1+1]; i++)
	    {
	      tv=Lv->el[i];
	      if (perm[tv]==-1)
		{
		  /* see if it is a condidate for min degree */
		  if (min_deg==-1)
		    {
		      /* the first one -> is candidate */
		      candidate=tv;
		      min_deg=ad->ix[tv+1] - ad->ix[tv];
		    }
		  else if (ad->ix[tv+1] - ad->ix[tv] < min_deg)
		    {
		      /* not the first but lower degree */
		      candidate=tv;
		      min_deg=ad->ix[tv+1] - ad->ix[tv];
		    }
		}
	    } /* end all vertices of the level */
	  /* if a new node of minimal degree was found, number it */
	  if (min_deg!=-1)
	    {
	      tv=candidate;
	      perm[tv]=*current;
	      iperm[*current]=tv;
	      (*current)++;
	      lowest_l1=tv;
	      found_new=1;
#ifdef DEBUGFEINS
	      if (*current>n2)
		{
		  fprintf(stderr, "bandw_number_adj_vert: "
			  "something wrong, try to number over "
			  "limit: current=%"dFIDX"   n2=%"dFIDX"\n",
			   *current,  n2);
		  return FAIL;
		}
#else 
	      if (*current==n2) return SUCCESS;
#endif
	    }
	} /* end check for unnumbered if l1==l2 */
    } /* end check for nodes to be numbered */

  /* done */
  return SUCCESS;
}



