OpenCL Example and tutorial

Moderator: dar

OpenCL Example and tutorial

Postby dkhenry » Mon Dec 17, 2012 9:42 pm

What is the go-to OpenCL example for the Parallella? Also, can we use OpenCL in the SDK?
dkhenry
 
Posts: 1
Joined: Mon Dec 17, 2012 3:26 am

Re: OpenCL Example and tutorial

Postby cordite » Mon Dec 17, 2012 10:10 pm

Could you change the title so that it reads more like a question than a statement that this thread is itself an example and tutorial?
cordite
 
Posts: 8
Joined: Mon Dec 17, 2012 3:26 am

Re: OpenCL Example and tutorial

Postby jar » Tue Dec 18, 2012 4:41 am

Take a look at the OpenCL Matrix Multiplication example that demonstrates some of the OpenCL kernel extensions that have been implemented to support the Epiphany architecture.

Also, there are many OpenCL examples available in the COPRTHR SDK
User avatar
jar
 
Posts: 284
Joined: Mon Dec 17, 2012 3:27 am

Re: OpenCL Example and tutorial

Postby dar » Wed Dec 19, 2012 11:55 pm

Here is a very simple example calculating Mandelbrot Set on a 16-core Epiphany processor, using OpenCL kernel and STDCL host code. Not optimized. Was used to generate the attached image. Perhaps it provides a glimpse into how the parallel cores can be utilized on Parallella board by leveraging OpenCL.

Code: Select all
// Colour ramp: map an iteration count n to a colour-channel value.
// NOTE(review): n is not parenthesised in the expansions, so pass only a
// simple identifier or literal as the argument.
#define set_red(n) (5*n  )
#define set_green(n) (20*n  )
#define set_blue(n) 0

// Mandelbrot kernel: each work-item renders one image row.
//
// The work-item's global id selects the row j. For every column i the point
// (startx + i*dx, starty + j*dy) is iterated at most `iterations` times; the
// loop counter at the moment the escape test fires selects the colour, and a
// point that never escapes is coloured as if the counter were 0.
//
//   iterations    maximum number of iterations per pixel
//   width         number of pixels computed per row
//   startx/starty coordinates of pixel (0,0)
//   dx/dy         coordinate step per pixel in x and y
//   pixels        output buffer, 3 bytes per pixel in (r,g,b) order; row j
//                 starts at byte offset j*width*3
__kernel void mandel_kern(
   int iterations,
   int width,
   float startx,
   float starty,
   float dx,
   float dy,
   __global uchar* pixels
)
{
   // Staging buffer for one row, filled locally and copied out once at the end.
   // NOTE(review): WIDTH is not defined in this listing - presumably supplied
   // when the kernel is compiled. It must be >= width, otherwise the stores
   // in the loop below run past the end of line[]. Confirm.
   unsigned char line[3*WIDTH];

   // Row index handled by this work-item.
   int j = get_global_id(0);

   int i;
   for (i = 0; i < width; i++) {

      // Coordinates of pixel (i, j).
      float x = startx + i*dx;
      float y = starty + j*dy;

      // The iteration starts from the point itself rather than from zero.
      float r = x; float s = y;

      int n;      
      for (n = 0; n < iterations; n++) {

         // (r + s*i)^2 + (x + y*i), split into real and imaginary parts.
         float nextr = ((r * r) - (s * s)) + x;
         float nexts = (2 * r * s) + y;
         r = nextr;
         s = nexts;
         
         // Escape test on the squared magnitude (radius 2).
         if ((r * r) + (s * s) > 4) break;

      }

      // Never escaped: use n = 0, which the colour macros map to (0,0,0).
      if (n == iterations) n=0;

      // Clamp each channel to 255 before storing it in a byte.
      line[(i * 3) + 0 ] = min(255,set_red(n));
      line[(i * 3) + 1 ] = min(255,set_green(n));
      line[(i * 3) + 2 ] = min(255,set_blue(n));   

   }

   // Copy the finished row into the output buffer in one call.
   // NOTE(review): the length is 3*WIDTH, not 3*width; the two agree only
   // when width == WIDTH. memcopy is not a standard OpenCL C built-in -
   // presumably provided by the Epiphany/COPRTHR toolchain. TODO confirm.
   memcopy(&pixels[(j * width * 3)],line,3*WIDTH);

}



Code: Select all
// The modifications porting this code to OpenCL are
// Copyright (c) 2012 Brown Deer Technology, LLC.
//
// Mandelbrot.c
// Written by User:Evercat
//
// This draws the Mandelbrot set and spits out a .bmp file.
// Should be quite portable (endian issues have been taken
// care of, for example)
//
// Released under the GNU Free Documentation License
// or the GNU Public License, whichever you prefer:
// 9 February, 2004.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdcl.h>

#include "Timer.h"

// Name of the bitmap file that is written.
#define OUTFILE "mandelbrot.bmp"

// Output image size in pixels.
#define WIDTH 1024
#define HEIGHT 768

// Centre of the view and scale (pixels per unit length).
// The negative literal is parenthesised so it survives any surrounding
// expression unchanged.
#define CENTRE_X (-0.5)
#define CENTRE_Y 0
#define ZOOM 300

#define ITERATIONS 1024  // Higher is more detailed, but slower...

// Plotting functions and parameters...
// Each maps an iteration count n to a colour-channel value. Arguments are
// parenthesised so that expressions such as bailoutr(a+b) expand correctly.

#define bailoutr(n) (5*(n))
#define bailoutg(n) (20*(n))
#define bailoutb(n) 0

// Smaller of a and b. Both arguments are fully parenthesised; note that the
// selected argument is still evaluated twice, so avoid side effects.
#define min(a,b) (((a)<(b))?(a):(b))

// Colours for the set itself...

#define IN_SET_R 0
#define IN_SET_G 0
#define IN_SET_B 0

// Writes a width x height image of 3-byte (r,g,b) pixels to `filename`
// as a 24-bit BMP.
void drawbmp(int width, int height, unsigned char* pixels, char * filename);

/////////////////////////////////// MAIN PROGRAM ///////////////////////////////////

int main (void)
{

   float x; float r; float nextr;
   float y; float s; float nexts;
   int n;
   float startx; float endx;
   float starty; float endy;
   float dx; float dy;
   float dx_over_width,dy_over_height;

   int iterations = ITERATIONS;
   int width = WIDTH;
   int height = HEIGHT;

   cl_uchar* pixels = (cl_uchar*) clmalloc(stdacc,width * height * 3,0);

   startx = CENTRE_X - ((float) WIDTH / (ZOOM * 2));
   endx = CENTRE_X + ((float) WIDTH / (ZOOM * 2));

   starty = CENTRE_Y - ((float) HEIGHT / (ZOOM * 2));
   endy = CENTRE_Y + ((float) HEIGHT / (ZOOM * 2));

   printf("\n   Plotting from (%f, %f) to (%f, %f)\n",
      startx, starty, endx, endy);

   dx = endx - startx;
   dy = endy - starty;
   dx_over_width = dx / width;
   dy_over_height = dy / height;

   cl_kernel krn = clsym(stdacc,0,"mandel_kern",CLLD_NOW);

   clndrange_t ndr = clndrange_init1d( 0,height, 16);

   clforka(stdacc,0,krn,&ndr,CL_EVENT_WAIT,
      iterations,width,startx,starty,dx_over_width,dy_over_height,pixels);

   clmsync(stdacc,0,pixels,CL_MEM_HOST|CL_EVENT_WAIT);


   drawbmp(width,height,pixels, OUTFILE);
   printf("\n   Saved to %s. Done.\n", OUTFILE);

   clfree(pixels);

   return 0;
}


// Writes `value` to `out` as four bytes, least significant first, so the
// file layout does not depend on host endianness.
static void drawbmp_put_le32 (FILE * out, unsigned int value)
{
   int shift;

   for (shift = 0; shift < 32; shift += 8)
   {
      fputc((int) ((value >> shift) & 0xFFu), out);
   }
}

// Writes an uncompressed 24-bit BMP file.
//
//   width, height  image size in pixels
//   pixels         width*height pixels, 3 bytes each in (r,g,b) order,
//                  rows stored top to bottom
//   filename       path of the file to create or overwrite
//
// If the file cannot be opened, a message is printed to stderr and nothing is
// written. Write and close failures are likewise reported on stderr; the
// function itself returns nothing.
void drawbmp (int width, int height, unsigned char* pixels, char * filename) {

   FILE * outfile;
   int extrabytes;
   int paddedsize;
   int x; int y; int n;

   extrabytes = 4 - ((width * 3) % 4); // How many bytes of padding to add to
                                       // each horizontal line - the size of
                                       // which must be a multiple of 4 bytes.
   if (extrabytes == 4)
      extrabytes = 0;

   paddedsize = ((width * 3) + extrabytes) * height;

   outfile = fopen (filename, "wb");
   if (outfile == NULL)
   {
      perror(filename);
      return;
   }

   // File header (14 bytes) followed by the info header (40 bytes).
   // Every multi-byte field is written one byte at a time to avoid
   // endian issues.

   fputs("BM", outfile);

   drawbmp_put_le32(outfile, (unsigned int) (paddedsize + 54)); // bfSize (whole file size)
   drawbmp_put_le32(outfile, 0);                       // bfReserved (both)
   drawbmp_put_le32(outfile, 54);                      // bfOffbits
   drawbmp_put_le32(outfile, 40);                      // biSize
   drawbmp_put_le32(outfile, (unsigned int) width);    // biWidth
   drawbmp_put_le32(outfile, (unsigned int) height);   // biHeight

   // biPlanes = 1 and biBitCount = 24, two bytes each.

   fputc(1, outfile);
   fputc(0, outfile);
   fputc(24, outfile);
   fputc(0, outfile);

   drawbmp_put_le32(outfile, 0);                       // biCompression
   drawbmp_put_le32(outfile, (unsigned int) paddedsize); // biSizeImage
   drawbmp_put_le32(outfile, 0);                       // biXPelsPerMeter
   drawbmp_put_le32(outfile, 0);                       // biYPelsPerMeter
   drawbmp_put_le32(outfile, 0);                       // biClrUsed
   drawbmp_put_le32(outfile, 0);                       // biClrImportant

   // Headers done, now write the data...

   for (y = height - 1; y >= 0; y--)  // BMPs are written bottom to top.
   {
      const unsigned char * row = pixels + (y * width * 3);

      for (x = 0; x < width; x++)
      {
         // Also, it's written in (b,g,r) format...

         fputc(row[(x * 3) + 2], outfile);
         fputc(row[(x * 3) + 1], outfile);
         fputc(row[(x * 3) + 0], outfile);
      }

      // Pad the line so its length is divisible by 4.
      for (n = 0; n < extrabytes; n++)
      {
         fputc(0, outfile);
      }
   }

   if (ferror(outfile))
   {
      fprintf(stderr, "%s: write error\n", filename);
   }

   if (fclose (outfile) != 0)
   {
      perror(filename);
   }
}
Attachments
mandelbrot-E16G.jpg
mandelbrot-E16G.jpg (106.1 KiB) Viewed 10720 times
dar
 
Posts: 88
Joined: Mon Dec 17, 2012 3:26 am

Re: OpenCL Example and tutorial

Postby Dade » Thu Dec 20, 2012 9:55 am

dar wrote:Here is a very simple example calculating Mandelbrot Set on a 16-core Epiphany processor, using OpenCL kernel and STDCL host code. Not optimized. Was used to generate the attached image. Perhaps it provides a glimpse into how the parallel cores can be utilized on Parallella board by leveraging OpenCL.


Thanks for sharing. I'm a bit interested to know why the kernel has been written in that way. It would run very very very slow on any GPU so I assume there are Epiphany related reasons to use that particular scheme.

I guess the line[] array is stored in local Epiphany memory (one copy per core) while the pixels[] array is stored in ARM memory. Why not write the result directly to the pixels[] array? Is it a lot faster on Epiphany to transfer a large chunk of memory (i.e. line[]) than a few bytes at a time (i.e. n)?

A Mandelbrot Set renderer was the very first OpenCL program I wrote a few years ago (http://davibu.interfree.it/opencl/mande ... elGPU.html). It was written with the intent of comparing native C with OpenCL code. I was very impressed by the result: an ATI HD4870 was more than 60 times faster than a single core of an Intel Q6600.

Have you tried to run your kernel on ARM OpenCL CPU device ? How much faster is Epiphany than the ARM ?
User avatar
Dade
 
Posts: 26
Joined: Sun Dec 16, 2012 8:59 pm

Re: OpenCL Example and tutorial

Postby dar » Sat Dec 22, 2012 4:07 am

The kernel is certainly not optimized, but maybe 10-15 min of thought was given to making the kernel match with Epiphany to get decent performance. Have not run this on ARM. It was faster than CPU used with the earlier eval kit.

Your question raises a good point that programmers new to OpenCL (and in general perhaps) should be aware of - being able to express an algorithm in a common API does not mean that the precise form of your code will run well (or even work) on different hardware platforms. This is a bit of a fantasy. Code is portable across GPUs to the extent they have similar architectural features, and we begin to believe the fantasy sometimes. Performant code must be tuned for a given architecture.

With respect to the Mandelbrot kernel, you are correct, it was designed to write back a line to DRAM. There are other factors less obvious. Epiphany cores are more like a CPU than a GPU, i.e., it's not SIMD or SIMT even though it can support both models. It's not a multi-threaded architecture, the cores are scalar, and it has a memory architecture different from a GPU. This impacts the code you write. For example, with a GPU you try to keep thousands of threads "in flight" and you have to pay attention to certain memory alignment rules between threads. With Epiphany I believe you gain nothing by keeping more threads in flight than the number of physical cores, and the memory rules are different.

The intent is to eventually provide a programming "best practices" guide for Parallella to help explain some of these things by example.
dar
 
Posts: 88
Joined: Mon Dec 17, 2012 3:26 am

Re: OpenCL Example and tutorial

Postby Gedece » Sat Dec 22, 2012 11:43 am

That "Best Practices" guide sounds interesting, please post an announcement once it becomes available, it will help a lot of us to understand things better.
User avatar
Gedece
 
Posts: 23
Joined: Mon Dec 17, 2012 3:18 am
Location: Buenos Aires, Argentina

Re: OpenCL Example and tutorial

Postby Transcendental » Sat Dec 22, 2012 11:37 pm

Gedece wrote:That "Best Practices" guide sounds interesting, please post an announcement once it becomes available, it will help a lot of us to understand things better.


And let's not forget that there is a Parallella eBook on the way for those that ordered it as part of their Kickstarter reward bundle.

When this eBook is going to arrive, I don't know.
User avatar
Transcendental
 
Posts: 49
Joined: Mon Dec 17, 2012 1:41 am

Re: OpenCL Example and tutorial

Postby Lord_Rafa » Tue May 14, 2013 1:36 am

Hello,

I need help, I was trying to run this example but I got this error:

coprthr-1.5.0-RC1 (Marathon)
out: e_open(): mmap failure.
out: e_alloc(): mmap failure.
[15439] clmesg ERROR: device.c(257): e_alloc returned 1
[15439] clmesg info: cmdsched.c(88): cmdqx0: run
[15439] clmesg info: cmdsched.c(88): cmdqx0: run
Segmentation fault

first I compiled with clcc the opencl code to get a .o file and after I compiled and linked with gcc
Lord_Rafa
 
Posts: 3
Joined: Tue May 14, 2013 1:34 am

Re: OpenCL Example and tutorial

Postby ysapir » Tue May 14, 2013 6:18 am

mmap() failure usually (but not necessarily) means that you are not running as superuser.
User avatar
ysapir
 
Posts: 393
Joined: Tue Dec 11, 2012 7:05 pm

Next

Return to OpenCL

Who is online

Users browsing this forum: No registered users and 1 guest

cron