/*  *   sample code for *   STENCIL OPTIMIZATION *  *   Michael Schlenkrich *   Silicon Graphics *   November 1996 *  */ #include <sys/types.h>#include <sys/times.h>#include <malloc.h>#include <stdlib.h>#include <stdio.h>#include <limits.h>#include <math.h> /* R10K specific definition */#define MHZ 195double nanoseconds_per_cycle = 1000./MHZ ; #define P_AHEAD 16int p_ahead = P_AHEAD;long allocated_bytes;void Stencil_27Point_Isotrope_ORG(			      double***  field_out,			      double***  field_in,			      int       size_x1,			      int       size_x2,			      int       size_x3,			      double    kernel_weights[3][3][3]			      ){   int    i,j,k;  double weight0,weight1,weight2,weight3,out0,out1,out2,out3;  weight0 = kernel_weights[1][1][1];  weight1 = kernel_weights[1][1][0];  weight2 = kernel_weights[1][0][0];  weight3 = kernel_weights[0][0][0];     for ( i=1; i< size_x1-1 ;i++)    for ( j=1; j< size_x2-1 ;j++)#pragma ivdep       for ( k=1; k< size_x3-1 ;k++)	{ 	  out3 = weight3*(    field_in[i-1][j-1][k-1]			    + field_in[i+1][j-1][k-1]			    + field_in[i-1][j+1][k-1]			    + field_in[i+1][j+1][k-1]			    + field_in[i-1][j-1][k+1]			    + field_in[i+1][j-1][k+1]			    + field_in[i-1][j+1][k+1]			    + field_in[i+1][j+1][k+1] );	  out2 = weight2*(    field_in[i  ][j-1][k-1]			    + field_in[i-1][j  ][k-1]			    + field_in[i+1][j  ][k-1]			    + field_in[i  ][j+1][k-1]			    + field_in[i-1][j-1][k  ]			    + field_in[i+1][j-1][k  ]			    + field_in[i-1][j+1][k  ]			    + field_in[i+1][j+1][k  ]			    + field_in[i  ][j-1][k+1]			    + field_in[i-1][j  ][k+1]			    + field_in[i+1][j  ][k+1]			    + field_in[i  ][j+1][k+1] );	  out1 = weight1*(    field_in[i  ][j  ][k-1]			    + field_in[i  ][j  ][k+1]			    + field_in[i  ][j+1][k  ]			    + field_in[i+1][j  ][k  ]			    + field_in[i-1][j  ][k  ]			    + field_in[i  ][j-1][k  ] );	  	  out0 = weight0*     field_in[i  ][j  ][k  ]; 	  field_out[i][j][k] = out0+out1+out2+out3; 	}   return;}void Stencil_27Point_Isotrope_O1(			      double***  field_out,			      double***  field_in,			      int       size_x1,			      int       size_x2,			      int       size_x3,			      double    kernel_weights[3][3][3]			      ){   int    i,j,k;  double weight0,weight1,weight2,weight3,out0,out1,out2,out3;    double **pout_i, *pout_i_j;  double **pin_im, **pin_i0, **pin_ip;  double *pin_im_jm, *pin_im_j0, *pin_im_jp,         *pin_i0_jm, *pin_i0_j0, *pin_i0_jp,         *pin_ip_jm, *pin_ip_j0, *pin_ip_jp;    weight0 = kernel_weights[1][1][1];  weight1 = kernel_weights[1][1][0];  weight2 = kernel_weights[1][0][0];  weight3 = kernel_weights[0][0][0];    pin_im = field_in[0];    pin_i0 = field_in[1];    pin_ip = field_in[2];     for ( i=1; i< size_x1-1 ;i++) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pout_i = field_out[i];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pout_i_j = pout_i[j];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ 	  out3 = weight3*(    pin_im_jm[k-1]			    + pin_ip_jm[k-1]			    + pin_im_jp[k-1]			    + pin_ip_jp[k-1]			    + pin_im_jm[k+1]			    + pin_ip_jm[k+1]			    + pin_im_jp[k+1]			    + pin_ip_jp[k+1] );	  out2 = weight2*(    pin_i0_jm[k-1]			    + pin_im_j0[k-1]			    + pin_ip_j0[k-1]			    + pin_i0_jp[k-1]			    + pin_im_jm[k  ]			    + pin_ip_jm[k  ]			    + pin_im_jp[k  ]			    + pin_ip_jp[k  ]			    + pin_i0_jm[k+1]			    + pin_im_j0[k+1]			    + pin_ip_j0[k+1]			    + pin_i0_jp[k+1] );	  out1 = weight1*(    pin_i0_j0[k-1]			    + pin_i0_j0[k+1]			    + pin_i0_jp[k  ]			    + pin_ip_j0[k  ]			    + pin_im_j0[k  ]			    + pin_i0_jm[k  ] );	  	  out0 = weight0*     pin_i0_j0[k  ]; 	  pout_i_j[k] = out0+out1+out2+out3;  	}    }   }  return;}void Stencil_27Point_Isotrope_O2(			      double***  field_out,			      double***  field_in,			      int       size_x1,			      int       size_x2,			      int       size_x3,			      double    kernel_weights[3][3][3]			      ){   int    i,j,k;  double weight0,weight1,weight2,weight3,out0,out1,out2,out3;    double **pout_i, *pout_i_j;  double **pin_im, **pin_i0, **pin_ip;  double *pin_im_jm, *pin_im_j0, *pin_im_jp,         *pin_i0_jm, *pin_i0_j0, *pin_i0_jp,         *pin_ip_jm, *pin_ip_j0, *pin_ip_jp;    weight0 = kernel_weights[1][1][1];  weight1 = kernel_weights[1][1][0];  weight2 = kernel_weights[1][0][0];  weight3 = kernel_weights[0][0][0];    pin_im = field_in[0];    pin_i0 = field_in[1];    pin_ip = field_in[2];     for ( i=1; i< size_x1-1 ;i++) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pout_i = field_out[i];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pout_i_j = pout_i[j];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ #pragma prefetch_ref=pin_ip_jp[k+16], stride=16, kind=rd#pragma prefetch_ref=pout_i_j[k+16], stride=16, kind=wr	  out3 = weight3*(    pin_im_jm[k-1]			    + pin_ip_jm[k-1]			    + pin_im_jp[k-1]			    + pin_ip_jp[k-1]			    + pin_im_jm[k+1]			    + pin_ip_jm[k+1]			    + pin_im_jp[k+1]			    + pin_ip_jp[k+1] );	  out2 = weight2*(    pin_i0_jm[k-1]			    + pin_im_j0[k-1]			    + pin_ip_j0[k-1]			    + pin_i0_jp[k-1]			    + pin_im_jm[k  ]			    + pin_ip_jm[k  ]			    + pin_im_jp[k  ]			    + pin_ip_jp[k  ]			    + pin_i0_jm[k+1]			    + pin_im_j0[k+1]			    + pin_ip_j0[k+1]			    + pin_i0_jp[k+1] );	  out1 = weight1*(    pin_i0_j0[k-1]			    + pin_i0_j0[k+1]			    + pin_i0_jp[k  ]			    + pin_ip_j0[k  ]			    + pin_im_j0[k  ]			    + pin_i0_jm[k  ] );	  	  out0 = weight0*     pin_i0_j0[k  ]; 	  pout_i_j[k] = out0+out1+out2+out3; #ifdef DD	  	  if((i<3)&&(j==1)&&(k==1)) printf ("**%d %f %f %f %f **\n", i, out0, out1, out2, out3);#endif 	}    }   }  return;}void Stencil_27Point_Isotrope_O3(			      double***  field_out,			      double***  field_in,			      int       size_x1,			      int       size_x2,			      int       size_x3,			      double    kernel_weights[3][3][3]			      ){   int    i,j,k;  double weight0,weight1,weight2,weight3,out0,out1,out2,out3;    double **pout_i, *pout_i_j;  double **pin_im, **pin_i0, **pin_ip;  double *pin_im_jm, *pin_im_j0, *pin_im_jp,         *pin_i0_jm, *pin_i0_j0, *pin_i0_jp,         *pin_ip_jm, *pin_ip_j0, *pin_ip_jp;    weight0 = kernel_weights[1][1][1];  weight1 = kernel_weights[1][1][0];  weight2 = kernel_weights[1][0][0];  weight3 = kernel_weights[0][0][0];    pin_im = field_in[0];    pin_i0 = field_in[1];    pin_ip = field_in[2];     for ( i=1; i< size_x1-1 ;i++) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pout_i = field_out[i];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pout_i_j = pout_i[j];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ #pragma prefetch_ref=pin_ip_jp[k+16], stride=16, kind=rd#pragma prefetch_ref=pin_i0_jp[k+8], stride=4, kind=rd#pragma prefetch_ref=pin_im_jp[k+8], stride=4, kind=rd#pragma prefetch_ref=pout_i_j[k+16], stride=16, kind=wr	  out3 = weight3*(    pin_im_jm[k-1]			    + pin_ip_jm[k-1]			    + pin_im_jp[k-1]			    + pin_ip_jp[k-1]			    + pin_im_jm[k+1]			    + pin_ip_jm[k+1]			    + pin_im_jp[k+1]			    + pin_ip_jp[k+1] );	  out2 = weight2*(    pin_i0_jm[k-1]			    + pin_im_j0[k-1]			    + pin_ip_j0[k-1]			    + pin_i0_jp[k-1]			    + pin_im_jm[k  ]			    + pin_ip_jm[k  ]			    + pin_im_jp[k  ]			    + pin_ip_jp[k  ]			    + pin_i0_jm[k+1]			    + pin_im_j0[k+1]			    + pin_ip_j0[k+1]			    + pin_i0_jp[k+1] );	  out1 = weight1*(    pin_i0_j0[k-1]			    + pin_i0_j0[k+1]			    + pin_i0_jp[k  ]			    + pin_ip_j0[k  ]			    + pin_im_j0[k  ]			    + pin_i0_jm[k  ] );	  	  out0 = weight0*     pin_i0_j0[k  ]; 	  pout_i_j[k] = out0+out1+out2+out3;  	}    }   }  return;}void Stencil_27Point_Isotrope_O4(			      double***  field_out,			      double***  field_in,			      int       size_x1,			      int       size_x2,			      int       size_x3,			      double    kernel_weights[3][3][3]			      ){   int    i,j,k;  double weight0,weight1,weight2,weight3,out0,out1,out2,out3;  int off;    double **pout_i, *pout_i_j;  double **pout_ip, *pout_ip_j;  double **pin_im, **pin_i0, **pin_ip, **pin_ipp;  double *pin_im_jm, *pin_im_j0, *pin_im_jp,         *pin_i0_jm, *pin_i0_j0, *pin_i0_jp,         *pin_ip_jm, *pin_ip_j0, *pin_ip_jp,          *pin_ipp_jm, *pin_ipp_j0, *pin_ipp_jp;  double t1_km, t1_k0, t1_kp, tp1_km, tp1_k0, tp1_kp,          t2_km, t2_k0, t2_kp, tp2_km, tp2_k0, tp2_kp,         t3_km, t3_k0, t3_kp, tp3_km, tp3_k0, tp3_kp;  double tmp1,tmp2;     weight0 = kernel_weights[1][1][1];  weight1 = kernel_weights[1][1][0];  weight2 = kernel_weights[1][0][0];  weight3 = kernel_weights[0][0][0];  off = (size_x1-2)%2;    pin_im = field_in[0];    pin_i0 = field_in[1];    pin_ip = field_in[2];     for ( i=1; i< 1+off ;i++) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pout_i = field_out[i];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pout_i_j = pout_i[j];          t1_km  = pin_i0_j0[0];          t2_km  = pin_i0_jm[0] + pin_ip_j0[0]                 + pin_i0_jp[0] + pin_im_j0[0];          t3_km  = pin_im_jm[0] + pin_ip_jm[0]                 + pin_im_jp[0] + pin_ip_jp[0];          t1_k0  = pin_i0_j0[1];          t2_k0  = pin_i0_jm[1] + pin_ip_j0[1]                 + pin_i0_jp[1] + pin_im_j0[1];          t3_k0  = pin_im_jm[1] + pin_ip_jm[1]                 + pin_im_jp[1] + pin_ip_jp[1];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ #pragma prefetch_ref=pin_ip_jp[k+16], stride=4, kind=rd#pragma prefetch_ref=pout_i_j[k+16], stride=4, kind=wr          t1_kp  = pin_i0_j0[k+1];          t2_kp  = pin_i0_jm[k+1] + pin_ip_j0[k+1]                 + pin_i0_jp[k+1] + pin_im_j0[k+1];          t3_kp  = pin_im_jm[k+1] + pin_ip_jm[k+1]                 + pin_im_jp[k+1] + pin_ip_jp[k+1]; 	  out3 = weight3*( t3_kp         + t3_km );	  out2 = weight2*( t2_kp + t3_k0 + t2_km );	  out1 = weight1*( t1_kp + t2_k0 + t1_km );	  	  out0 = weight0*          t1_k0; 	  pout_i_j[k] = out0+out1+out2+out3;           t1_km = t1_k0; t2_km = t2_k0; t3_km = t3_k0;          t1_k0 = t1_kp; t2_k0 = t2_kp; t3_k0 = t3_kp; 	}    }   }  for ( i=1+off; i< size_x1-1 ;i+=2) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pin_ipp = field_in[i+2];    pout_i = field_out[i];     pout_ip = field_out[i+1];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pin_ipp_jm = pin_ipp[j-1];       pin_ipp_j0 = pin_ipp[j  ];       pin_ipp_jp = pin_ipp[j+1];       pout_i_j = pout_i[j];      pout_ip_j = pout_ip[j];          t1_km  = pin_i0_j0[0];          t2_km  = pin_i0_jm[0] + pin_ip_j0[0]                 + pin_i0_jp[0] + pin_im_j0[0];          t3_km  = pin_im_jm[0] + pin_ip_jm[0]                 + pin_im_jp[0] + pin_ip_jp[0];          tp1_km = pin_ip_j0[0];          tp2_km = pin_ip_jm[0] + pin_ipp_j0[0]                 + pin_ip_jp[0] + pin_i0_j0[0];          tp3_km = pin_i0_jm[0] + pin_ipp_jm[0]                 + pin_i0_jp[0] + pin_ipp_jp[0];          t1_k0  = pin_i0_j0[1];          t2_k0  = pin_i0_jm[1] + pin_ip_j0[1]                 + pin_i0_jp[1] + pin_im_j0[1];          t3_k0  = pin_im_jm[1] + pin_ip_jm[1]                 + pin_im_jp[1] + pin_ip_jp[1];          tp1_k0 = pin_ip_j0[1];          tp2_k0 = pin_ip_jm[1] + pin_ipp_j0[1]                 + pin_ip_jp[1] + pin_i0_j0[1];          tp3_k0 = pin_i0_jm[1] + pin_ipp_jm[1]                 + pin_i0_jp[1] + pin_ipp_jp[1];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ #pragma prefetch_ref=pin_ip_jp[k+16], stride=4, kind=rd#pragma prefetch_ref=pin_ipp_jp[k+16], stride=4, kind=rd#pragma prefetch_ref=pin_i0_jp[k+4], stride=4, kind=rd#pragma prefetch_ref=pin_im_jp[k+4], stride=4, kind=rd#pragma prefetch_ref=pout_i_j[k+16], stride=4, kind=wr#pragma prefetch_ref=pout_ip_j[k+16], stride=4, kind=wr          tmp1   = pin_ip_jm[k+1] + pin_ip_jp[k+1];          tmp2   = pin_i0_jm[k+1] + pin_i0_jp[k+1];          t1_kp  = pin_i0_j0[k+1];          t2_kp  = tmp2 + pin_ip_j0[k+1] + pin_im_j0[k+1];          t3_kp  = tmp1 + pin_im_jm[k+1] + pin_im_jp[k+1];          tp1_kp = pin_ip_j0[k+1];          tp2_kp = tmp1 + pin_ipp_j0[k+1] + pin_i0_j0[k+1];          tp3_kp = tmp2 + pin_ipp_jm[k+1] + pin_ipp_jp[k+1]; 	  out3 = weight3*( t3_kp         + t3_km );	  out2 = weight2*( t2_kp + t3_k0 + t2_km );	  out1 = weight1*( t1_kp + t2_k0 + t1_km );	  	  out0 = weight0*          t1_k0; 	  pout_i_j[k] = out0+out1+out2+out3; 	  	  	  out3 = weight3*( tp3_kp          + tp3_km );	  out2 = weight2*( tp2_kp + tp3_k0 + tp2_km );	  out1 = weight1*( tp1_kp + tp2_k0 + tp1_km );	  	  out0 = weight0*           tp1_k0; 	  pout_ip_j[k] = out0+out1+out2+out3;            t1_km = t1_k0; t2_km = t2_k0; t3_km = t3_k0;          t1_k0 = t1_kp; t2_k0 = t2_kp; t3_k0 = t3_kp;          tp1_km = tp1_k0; tp2_km = tp2_k0; tp3_km = t3_k0;          tp1_k0 = tp1_kp; tp2_k0 = tp2_kp; tp3_k0 = t3_kp; 	}    }   }  return;}void Stencil_27Point_Isotrope_O4b(			      double***  field_out,			      double***  field_in,			      int       size_x1,			      int       size_x2,			      int       size_x3,			      double    kernel_weights[3][3][3]			      ){   int    i,j,k;  double weight0,weight1,weight2,weight3,out0,out1,out2,out3;  int off;    double **pout_i, *pout_i_j;  double **pout_ip, *pout_ip_j;  double **pin_im, **pin_i0, **pin_ip, **pin_ipp;  double *pin_im_jm, *pin_im_j0, *pin_im_jp,         *pin_i0_jm, *pin_i0_j0, *pin_i0_jp,         *pin_ip_jm, *pin_ip_j0, *pin_ip_jp,          *pin_ipp_jm, *pin_ipp_j0, *pin_ipp_jp;    weight0 = kernel_weights[1][1][1];  weight1 = kernel_weights[1][1][0];  weight2 = kernel_weights[1][0][0];  weight3 = kernel_weights[0][0][0];  off = (size_x1-2)%2;    pin_im = field_in[0];    pin_i0 = field_in[1];    pin_ip = field_in[2];     for ( i=1; i< 1+off ;i++) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pout_i = field_out[i];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pout_i_j = pout_i[j];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ #pragma prefetch_ref=pin_ip_jp[k+16], stride=4, kind=rd#pragma prefetch_ref=pout_i_j[k+16], stride=4, kind=wr	  out3 = weight3*(    pin_im_jm[k-1]			    + pin_ip_jm[k-1]			    + pin_im_jp[k-1]			    + pin_ip_jp[k-1]			    + pin_im_jm[k+1]			    + pin_ip_jm[k+1]			    + pin_im_jp[k+1]			    + pin_ip_jp[k+1] );	  out2 = weight2*(    pin_i0_jm[k-1]			    + pin_im_j0[k-1]			    + pin_ip_j0[k-1]			    + pin_i0_jp[k-1]			    + pin_im_jm[k  ]			    + pin_ip_jm[k  ]			    + pin_im_jp[k  ]			    + pin_ip_jp[k  ]			    + pin_i0_jm[k+1]			    + pin_im_j0[k+1]			    + pin_ip_j0[k+1]			    + pin_i0_jp[k+1] );	  out1 = weight1*(    pin_i0_j0[k-1]			    + pin_i0_j0[k+1]			    + pin_i0_jp[k  ]			    + pin_ip_j0[k  ]			    + pin_im_j0[k  ]			    + pin_i0_jm[k  ] );	  	  out0 = weight0*     pin_i0_j0[k  ]; 	  pout_i_j[k] = out0+out1+out2+out3;  	}    }   }  for ( i=1+off; i< size_x1-1 ;i+=2) {    pin_im = field_in[i-1];    pin_i0 = field_in[i  ];    pin_ip = field_in[i+1];    pin_ipp = field_in[i+2];    pout_i = field_out[i];     pout_ip = field_out[i+1];     for ( j=1; j< size_x2-1 ;j++) {      pin_im_jm = pin_im[j-1];       pin_im_j0 = pin_im[j  ];       pin_im_jp = pin_im[j+1];       pin_i0_jm = pin_i0[j-1];       pin_i0_j0 = pin_i0[j  ];       pin_i0_jp = pin_i0[j+1];       pin_ip_jm = pin_ip[j-1];       pin_ip_j0 = pin_ip[j  ];       pin_ip_jp = pin_ip[j+1];       pin_ipp_jm = pin_ipp[j-1];       pin_ipp_j0 = pin_ipp[j  ];       pin_ipp_jp = pin_ipp[j+1];       pout_i_j = pout_i[j];      pout_ip_j = pout_ip[j];#pragma ivdep      for ( k=1; k< size_x3-1 ;k++)	{ #pragma prefetch_ref=pin_ip_jp[k+16], stride=4, kind=rd#pragma prefetch_ref=pin_ipp_jp[k+16], stride=4, kind=rd#pragma prefetch_ref=pin_i0_jp[k+8], stride=4, kind=rd#pragma prefetch_ref=pin_im_jp[k+8], stride=4, kind=rd#pragma prefetch_ref=pout_i_j[k+16], stride=4, kind=wr#pragma prefetch_ref=pout_ip_j[k+16], stride=4, kind=wr	  out3 = weight3*(    pin_im_jm[k-1]			    + pin_ip_jm[k-1]			    + pin_im_jp[k-1]			    + pin_ip_jp[k-1]			    + pin_im_jm[k+1]			    + pin_ip_jm[k+1]			    + pin_im_jp[k+1]			    + pin_ip_jp[k+1] );	  out2 = weight2*(    pin_i0_jm[k-1]			    + pin_im_j0[k-1]			    + pin_ip_j0[k-1]			    + pin_i0_jp[k-1]			    + pin_im_jm[k  ]			    + pin_ip_jm[k  ]			    + pin_im_jp[k  ]			    + pin_ip_jp[k  ]			    + pin_i0_jm[k+1]			    + pin_im_j0[k+1]			    + pin_ip_j0[k+1]			    + pin_i0_jp[k+1] );	  out1 = weight1*(    pin_i0_j0[k-1]			    + pin_i0_j0[k+1]			    + pin_i0_jp[k  ]			    + pin_ip_j0[k  ]			    + pin_im_j0[k  ]			    + pin_i0_jm[k  ] );	  	  out0 = weight0*     pin_i0_j0[k  ]; 	  pout_i_j[k] = out0+out1+out2+out3; 	  #ifdef DD	  	  if((i<3)&&(j==1)&&(k==1)) printf ("**%d %f %f %f %f **\n", i, out0, out1, out2, out3);#endif	  	  out3 = weight3*(    pin_i0_jm[k-1]			    + pin_ipp_jm[k-1]			    + pin_i0_jp[k-1]			    + pin_ipp_jp[k-1]			    + pin_i0_jm[k+1]			    + pin_ipp_jm[k+1]			    + pin_i0_jp[k+1]			    + pin_ipp_jp[k+1] );	  out2 = weight2*(    pin_ip_jm[k-1]			    + pin_i0_j0[k-1]			    + pin_ipp_j0[k-1]			    + pin_ip_jp[k-1]			    + pin_i0_jm[k  ]			    + pin_ipp_jm[k  ]			    + pin_i0_jp[k  ]			    + pin_ipp_jp[k  ]			    + pin_ip_jm[k+1]			    + pin_i0_j0[k+1]			    + pin_ipp_j0[k+1]			    + pin_ip_jp[k+1] );	  out1 = weight1*(    pin_ip_j0[k-1]			    + pin_ip_j0[k+1]			    + pin_ip_jp[k  ]			    + pin_ipp_j0[k  ]			    + pin_i0_j0[k  ]			    + pin_ip_jm[k  ] );	  	  out0 = weight0*     pin_ip_j0[k  ]; 	  pout_ip_j[k] = out0+out1+out2+out3; #ifdef DD	  	  if((i<3)&&(j==1)&&(k==1)) printf ("**%d %f %f %f %f **\n", i+1, out0, out1, out2, out3);#endif 	}    }   }  return;}double timer() {    struct tms tim;    clock_t check;    check = times (&tim);    if (check == -1) { printf("\n times problem\n"); return (0);}    return (tim.tms_utime/( (double) CLK_TCK));}#ifdef PADDINGint padded( int dim){  double c;  int n,n2,pad;  pad = dim;  if (dim > 30) {    n = dim%32;    if (n==0) pad = dim + 2;    if (n==1) pad = dim + 1;    if (n==31) pad = dim + 3;  }  return (pad);}#elseint padded( int dim){  return (dim);}#endifdouble ***Alloc_Double_3D_Matrix(                      int xdim,                      int ydim,                      int zdim                      ){  double ***v;  int    i, j;  long   yydim, zzdim;  yydim = (long) padded(ydim);  zzdim = (long) padded(zdim);  if (zzdim != zdim)  printf ("** padding: %d -> %d  \n",zdim,zzdim);  if (xdim==0 || ydim==0 || zdim==0) return (NULL);    if ( (v=        (double ***)calloc((size_t) xdim,(size_t) sizeof(double **)))       == NULL) return(NULL);  if ( (v[0]=        (double **)calloc((size_t) xdim*ydim,(size_t) sizeof(double *)))       == NULL) return(NULL);  for (i=0; i<xdim; i++)    v[i] = v[0] + ydim*i;  if ( (v[0][0] =        (double *)calloc((size_t) xdim*yydim*zzdim,(size_t) sizeof(double)))       == NULL) return(NULL);  for (i=0; i<xdim; i++)    for (j=0; j<ydim; j++)      v[i][j] = v[0][0] + zzdim*j + zzdim*yydim*i;  allocated_bytes += sizeof(double **)*xdim;  allocated_bytes += sizeof(double *)*xdim*ydim;  allocated_bytes += sizeof(double)*xdim*yydim*zzdim;  #ifdef DEBUG  printf ("memory allocated: %d \n", allocated_bytes);#endif  return v;}static  double ***field_in;static  double ***field_out; int main(int argc, char *argv[]){   int i,j,k;  int size_x1, size_x2, size_x3 ,size, bnd_size, l_size_x1, l_size_x2;  static double kernel_weights[3][3][3];  int irep,repeat,version;  double timing;  double weight0, weight1, weight2, weight3;  if (argc > 1)	  repeat = atoi(argv[1]); else repeat = 100;  if (argc > 2)	  size = atoi(argv[2]); else size = 10;  if (argc > 3)	  version = atoi(argv[3]); else version = 0;  size_x1 = size_x2 = size_x3 = size;  printf(" stencil  %d  %d  %d\n",repeat,size,version);  printf(" repeating stencil:    %d\n", repeat);  printf(" grid-size:            %d x %d x %d = %d\n",		size_x1, size_x2, size_x3,size_x1*size_x2*size_x3);  printf(" optimization version: %d\n", version);  field_in  = Alloc_Double_3D_Matrix(size_x1,size_x2,size_x3);  field_out = Alloc_Double_3D_Matrix(size_x1,size_x2,size_x3);  if ((field_in==NULL)||(field_out==NULL)) {      printf (" not able to allocate arrays!\n ");      exit(1);  }#ifdef FORTRAN/* in FORTRAN the leading dimension is needed */   l_size_x2 = padded(size_x2);  l_size_x1 = padded(size_x1);#endif   printf(" size of the 3D-grid: %12.6f MByte \n",                8*size_x1*size_x2*size_x3/1000./1000.);#ifdef DEBUG  printf ("pointer diff: %ld \n",         (long)((long long)field_in - (long long)field_out));#endif/* set some values and touch the arrays */  weight0 = kernel_weights[1][1][1]=1.;  weight1 = kernel_weights[1][1][0]=0.5;  weight2 = kernel_weights[1][0][0]=0.3;  weight3 = kernel_weights[0][0][0]=0.1;  for (i=0; i<size_x1; i++)   for (j=0; j<size_x2; j++)    for (k=0; k<size_x3; k++) {     field_in[i][j][k] = i*1. + j*0.1 - k*0.3;     field_out[i][j][k] = 0.;  }  timing = timer();  for (irep = 1 ; irep<=repeat; irep++)  {     if (version == 0) 	Stencil_27Point_Isotrope_ORG( field_out, field_in,		 size_x1, size_x2, size_x3, kernel_weights);     if (version == 1) 	Stencil_27Point_Isotrope_O1( field_out, field_in,		 size_x1, size_x2, size_x3, kernel_weights);     if (version == 2) 	Stencil_27Point_Isotrope_O2( field_out, field_in,		 size_x1, size_x2, size_x3, kernel_weights);     if (version == 3) 	Stencil_27Point_Isotrope_O3( field_out, field_in,		 size_x1, size_x2, size_x3, kernel_weights);     if (version == 4) 	Stencil_27Point_Isotrope_O4( field_out, field_in,		 size_x1, size_x2, size_x3, kernel_weights);#ifdef FORTRAN     if (version == -1)        fstencil_org_( field_out[0][0], field_in[0][0], 	        &size_x1, &size_x2, &size_x3, 		&weight0, &weight1, &weight2, &weight3, 		&size_x1, &l_size_x2);        if (version == -2)        fstencil_o2_( field_out[0][0], field_in[0][0], 	        &size_x1, &size_x2, &size_x3, 		&weight0, &weight1, &weight2, &weight3, 		&l_size_x1, &l_size_x2);        if (version == -3)        fstencil_o3_( field_out[0][0], field_in[0][0], 	        &size_x1, &size_x2, &size_x3, 		&weight0, &weight1, &weight2, &weight3, 		&l_size_x1, &l_size_x2);   #endif  }  timing = timer() - timing;    printf ("%4d %10.6f MByte %9.2f sec  per grid-point: %8.3f ns %7.1f cycles\n",        size_x1,          8*size_x1*size_x2*size_x3/1000./1000., 	timing, 	1.E9*timing/((size_x1-2)*(size_x2-2)*(size_x3-2)*repeat), 	1.E9*timing/((size_x1-2)*(size_x2-2)*(size_x3-2)*repeat)/nanoseconds_per_cycle);	#ifdef DEBUG  printf (" %f %f %f %f %f %f %f %f\n", field_out[1][1][1], field_out[1][1][2]			    , field_out[2][2][1], field_out[1][1][3]			    , field_out[1][2][1], field_out[2][1][1]			    , field_out[3][2][1], field_out[3][1][1]);#endif}