End-user products => VAST/AltiVec => Code examples

VAST/AltiVec Code Examples

Example 1 -- Aligned, no loop cleanup.

(NOTE: these code examples are for advanced users who want to see the level of code VAST generates internally. Typical users of VAST/AltiVec will probably not be interested in this level of detail, nor do they need to be. This intermediate code is normally fed directly to the backend compiler and does not need to be examined by the user.)

The code below is a very simple example where the loop iteration count (64) is a multiple of the vector size (so there is no loop cleanup needed) and we have specified that the arrays are already aligned (with the -Valigned switch).

The generated code is automatically unrolled by a factor of four. At the top of the loop the scalar "x" is placed into a vector register. Then all the vector data is loaded into registers from memory, four vector multiply-add operations are done, and the vector results are stored back into memory.

Original code:

 
extern float a[128], b[128], x;
/* Example 1 source: a[i] = a[i]*x + b[i] over the first 64 elements.
   The trip count (64) is a compile-time constant and a multiple of the
   4-float vector length, so the vectorized translation below needs no
   scalar cleanup loop. */
void simple ()
{
 int i;
 for (i=0; i<64; i++)
   a[i] = a[i]*x + b[i];
}

VAST output (intermediate C code):

 
 /* Translated by Pacific-Sierra Research VAST-C AltiVec 7.4 H 14:45:33 8/ 8/00 */
 /* Switches: -Valigned */
 extern float a[128], b[128], x;
 void  simple( )
 {
    /* VAST scratch integers.  In this example only j1 (element index)
       and j3 (byte offset) are read; j2 is assigned but never used,
       and j4-j7 are declared but unused. */
    int j1, j2, j3, j4, j5, j6, j7;
    int i;
    {
       {
          vector float a1v, b1v, r1v;
          vector float r2v = (vector float )(0);
          vector float a4v, b4v;
          vector float a3v, b3v;
          vector float a2v, b2v;
          /* Write the scalar x into lane 0 of r2v (via a pointer cast),
             then replicate lane 0 so r1v = {x, x, x, x}. */
          *((float *)&r2v) = x;
          r1v = vec_splat(r2v, 0);
          /* Main loop, 4-way unrolled over 4-float vectors: 16 floats
             per iteration.  vec_ld/vec_st offsets are in bytes; j3 is
             the element index scaled by sizeof(int) -- this relies on
             sizeof(int) == sizeof(float) == 4 on the target. */
          for ( j1 = 0; j1 < (64 - 4 * 4) + 1; j1 += 4 * 4 )
          {
             j3 = j1 * sizeof(int );
             j2 = j3 + 4 * sizeof(int );  /* computed but unused here */
             /* Load four consecutive 16-byte vectors from a[] and b[]
                (aligned loads, guaranteed valid by -Valigned). */
             a1v = vec_ld(j3, &a[0]);
             b1v = vec_ld(j3, &b[0]);
             a2v = vec_ld(j3 + 16, &a[0]);
             b2v = vec_ld(j3 + 16, &b[0]);
             a3v = vec_ld(j3 + 32, &a[0]);
             b3v = vec_ld(j3 + 32, &b[0]);
             a4v = vec_ld(j3 + 48, &a[0]);
             b4v = vec_ld(j3 + 48, &b[0]);
             /* a[i] = a[i]*x + b[i] as four vector multiply-adds,
                each result stored straight back to a[]. */
             a1v = vec_madd(r1v, a1v, b1v);
             vec_st(a1v, j3, &a[0]);
             a2v = vec_madd(r1v, a2v, b2v);
             vec_st(a2v, j3 + 16, &a[0]);
             a3v = vec_madd(r1v, a3v, b3v);
             vec_st(a3v, j3 + 32, &a[0]);
             a4v = vec_madd(r1v, a4v, b4v);
             vec_st(a4v, j3 + 48, &a[0]);
          }
       }
    }
 }

Example 2 -- General case.

This is the same example as above, except the iteration count is unknown at compile time and we have not specified that the arrays are aligned. Even though the generated code is considerably more complicated, it is still very fast compared with the original. Remember, normally you do not need to look at this code if you do not want to -- it's just the intermediate code that is passed on to the compiler by the driver.

In the translated code, vec_perm operations are used to align the data prior to vector operations.

Original code:


extern float a[99], b[99], x;
extern int n;
/* Example 2 source: the same saxpy-style loop, but the trip count n is
   a run-time value and no alignment of a[]/b[] has been promised, so
   the translation must handle misalignment and leftover iterations. */
void simple ()
{
 int i;
 for (i=0; i < n; i++)
   a[i] = a[i]*x + b[i];
}

VAST output (intermediate C code):


 /* Translated by Pacific-Sierra Research VAST-C AltiVec 7.4 H 15:29:27 8/ 8/00 */
 extern float a[99], b[99], x;
 extern int n;
 void  simple( )
 {
    /* VAST scratch integers: j1 = element index, j3 = byte offset of the
       current store, j2 = byte offset of the next load, j4 = residual
       element count (n mod 4), j5 = element-store loop index, j6 = copy
       of n, j7 = j4s + j4, used to pick a select mask below. */
    int j1, j2, j3, j4, j5, j6, j7;
    int i;
    {
       int j4s;
       if ( n > 0 )   /* original loop executes only when n > 0 */
       {
          {
             vector float a1v, b1v, r1v;
             vector float a18v, a19v;
             vector float r2v = (vector float )(0);
             vector float a15v, a16v, b9v, b10v, a17v;
             vector float a12v, a13v, b7v, b8v, a11v;
             vector float a9v, a10v, b5v, b6v, a14v;
             vector float a2v, a3v;
             /* vec_lvsl/vec_lvsr derive permute-control vectors from the
                low address bits of a[] and b[]; vec_perm uses a4v/b4v to
                merge pairs of aligned loads into the (possibly) unaligned
                element view, and a7v to shift results back for stores. */
             vector unsigned char a4v = vec_lvsl(0, &a[0]);
             vector float b2v, b3v;
             vector unsigned char b4v = vec_lvsl(0, &b[0]);
             vector float a5v, a6v;
             vector unsigned char a7v = vec_lvsr(0, &a[0]);
             vector unsigned char a8v = vec_lvsl(0, &a[0]);
             /* Element-granularity masks for the final partial store:
                in vec_sel(new, old, mask), zero words pass the new
                result and all-ones words keep what is already in
                memory, so elements beyond n are left untouched. */
             static vector unsigned long j1v[3] =  { (vector unsigned long
              )(0, 0XFFFFFFFF, 0XFFFFFFFF, 0XFFFFFFFF), (
             vector unsigned long )(0, 0, 0XFFFFFFFF, 0XFFFFFFFF), (
             vector unsigned long )(0, 0, 0, 0XFFFFFFFF) } ;
             /* Splat the scalar x into all four lanes of r1v. */
             *((float *)&r2v) = x;
             r1v = vec_splat(r2v, 0);
             j6 = n;
             /* Prime the software pipeline with the first aligned
                vectors; a5v carries data across store boundaries. */
             a2v = vec_ld(0, &a[0]);
             b2v = vec_ld(0, &b[0]);
             a5v = vec_perm(a2v, a2v, a8v);
             /* Main loop: 4-way unrolled, 16 floats per iteration.
                Offsets are bytes; the j1*sizeof(int) scaling assumes
                sizeof(int) == sizeof(float) == 4 on this target. */
             for ( j1 = 0; j1 < (j6 - 4 * 4) + 1; j1 += 4 * 4 )
             {
                j3 = j1 * sizeof(int );
                j2 = j3 + 4 * sizeof(int );
                a3v = vec_ld(j2, &a[0]);
                b3v = vec_ld(j2, &b[0]);
                a9v = vec_ld(j2 + 16, &a[0]);
                b5v = vec_ld(j2 + 16, &b[0]);
                a12v = vec_ld(j2 + 32, &a[0]);
                b7v = vec_ld(j2 + 32, &b[0]);
                /* For each 16-byte slice: merge adjacent aligned loads
                   into the unaligned view, multiply-add, realign with
                   a7v, and store. */
                a1v = vec_perm(a2v, a3v, a4v);
                a2v = vec_ld(j2 + 48, &a[0]);
                b1v = vec_perm(b2v, b3v, b4v);
                b2v = vec_ld(j2 + 48, &b[0]);
                a1v = vec_madd(r1v, a1v, b1v);
                a6v = vec_perm(a5v, a1v, a7v);
                vec_st(a6v, j3, &a[0]);
                a10v = vec_perm(a3v, a9v, a4v);
                b6v = vec_perm(b3v, b5v, b4v);
                a10v = vec_madd(r1v, a10v, b6v);
                a11v = vec_perm(a1v, a10v, a7v);
                vec_st(a11v, j3 + 16, &a[0]);
                a13v = vec_perm(a9v, a12v, a4v);
                b8v = vec_perm(b5v, b7v, b4v);
                a13v = vec_madd(r1v, a13v, b8v);
                a14v = vec_perm(a10v, a13v, a7v);
                vec_st(a14v, j3 + 32, &a[0]);
                a16v = vec_perm(a12v, a2v, a4v);
                b10v = vec_perm(b7v, b2v, b4v);
                a16v = vec_madd(r1v, a16v, b10v);
                a17v = vec_perm(a13v, a16v, a7v);
                vec_st(a17v, j3 + 48, &a[0]);
                a5v = a16v;   /* carry last result into the next merge */
             }
             /* Cleanup loop: one 4-float vector at a time for whatever
                the 16-wide main loop left over. */
             if ( (j6 & (4 * 4 - 1)) == 0 )
                a1v = a5v;
                else
                {
                   for ( ; j1 < (j6 - 4) + 1; j1 += 4 )
                   {
                      j3 = j1 * sizeof(int );
                      j2 = j3 + 4 * sizeof(int );
                      a3v = vec_ld(j2, &a[0]);
                      a1v = vec_perm(a2v, a3v, a4v);
                      b3v = vec_ld(j2, &b[0]);
                      b1v = vec_perm(b2v, b3v, b4v);
                      a1v = vec_madd(r1v, a1v, b1v);
                      a6v = vec_perm(a5v, a1v, a7v);
                      vec_st(a6v, j3, &a[0]);
                      a5v = a1v;
                      a2v = a3v;
                      b2v = b3v;
                   }
             }
             /* Residual handling: fewer than 4 elements remain.
                j4s = misalignment of a[] in floats ((addr & 15) / 4);
                results are blended into memory with vec_sel so that
                elements past n are not overwritten. */
             j4s = ((int )&a[0] & 15) >> 2;
             j4 = j6 & 3;
             if ( j4 > 0 )
             {
                j3 = j1 * sizeof(int );
                j2 = j3 + 4 * sizeof(int );
                a3v = vec_ld(j2, &a[0]);
                a1v = vec_perm(a2v, a3v, a4v);
                b3v = vec_ld(j2, &b[0]);
                b1v = vec_perm(b2v, b3v, b4v);
                a1v = vec_madd(r1v, a1v, b1v);
                a6v = vec_perm(a5v, a1v, a7v);
                j7 = j4s + j4;
                if ( j7 >= 4 )
                {
                   /* Residual spills across a 16-byte boundary: the
                      first block is stored whole, the overflow block
                      is merged with its prior memory contents. */
                   a18v = vec_ld(j3 + 16, &a[0]);
                   vec_st(a6v, j3, &a[0]);
                   if ( j7 > 4 )
                   {
                      a6v = vec_perm(a1v, a1v, a7v);
                      a19v = vec_sel(a6v, a18v, j1v[j7-5]);
                      vec_st(a19v, j3 + 16, &a[0]);
                   }
                }
                else
                {
                   /* Residual fits inside one 16-byte block: merge the
                      new elements with the old contents and store. */
                   a18v = vec_ld(j3, &a[0]);
                   a19v = vec_sel(a6v, a18v, j1v[j7-1]);
                   vec_st(a19v, j3, &a[0]);
                }
                a5v = a1v;
             }
             else
             {
                /* n is a multiple of 4, but if a[] is misaligned the
                   realigned tail of the last full vector still needs
                   storing, one element at a time with vec_ste. */
                j3 = j1 * sizeof(int );
                if ( j4s )
                {
                   a6v = vec_perm(a1v, a1v, a7v);
                   for ( j5 = 0; j5 < j4s; j5++ )
                      vec_ste(a6v, j3 + (j5 - j4s) * sizeof(float ), &a[0] );
                }
             }
          }
       }
    }
 }

Now aren't you glad you can get VAST to generate this code for you and you don't have to do it yourself? :^)

Return to Top.

Home Contact Legal

Copyright 2003, 2005 Crescent Bay Software Corp.