Integral Image or Summed Area Table of 2D matrix using CUDA C -


I am trying to compute the summed area table of a 2D matrix whose number of rows and columns are not equal. I have run into a slight problem: the code seems to function okay when the rows and columns are equal, but it fails to compute the last row of the final output when the rows and columns are not equal. The problem is that I can't figure out why this is happening.

The basic algorithm for the integral image / summed area table:

Basically, in an integral sum every pixel or index element holds the sum of all the matrix elements above and behind it (itself included). For instance, for a 3x2 input array with the following elements:

 [5, 2]
 [5, 2]
 [5, 2]

the integral sum in the output array is:

 [ 5,  7]
 [10, 14]
 [15, 21]

Basically, the following is what I am trying to do in CUDA C:

for(int matrixelement_y_index=0; matrixelement_y_index<=total_rows-1; matrixelement_y_index++) {     //matrixelement_x_index , matrixelement_y_index represent (x,y) indices of each matrix element     for(int matrixelement_x_index=0; matrixelement_x_index<=total_columns-1; matrixelement_x_index++)     {         int temp=0;           for(int r=0;r<=(matrixelement_y_index);r++)         {             for(int c=0; c<=matrixelement_x_index;c++)             {                 temp=temp+input[c][r];             }         }          output[matrixelement_y_index][matrixelement_x_index]=temp;     } } 

The CUDA C code that I have come up with so far is as follows:

#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// Kernel as posted in the question (line structure and CUDA identifier casing
// restored). Each block holds a single thread; the block at (gidx, gidy) sums
// a[] over counter in [0, gidy] and counter_two in [0, gidx], using width_x as
// the row stride, and stores the total in b[gidy*width_x+gidx].
__global__ void image_integral(int *a, int*b, int width_x,int width_y)
{
    // Thread ids equal block ids because each block contains 1 thread only.
    int gidx = blockIdx.x;
    int gidy = blockIdx.y;
    int temp=0;

    if(gidx>=width_x || gidy>=width_y)
    {
        // Return for threads that exceed the input array's x or y dimension.
        return;
    }
    else
    // Compute the integral image or summed area table.
    {
        // The first loop iterates from 0 to the y index of the thread, which
        // represents the corresponding element of the output/input array.
        for(int counter=0;counter<=gidy;counter++)
        {
            // The second loop iterates from 0 to the x index of the thread,
            // which represents the corresponding element of the output/input array.
            for(int counter_two=0; counter_two<=gidx; counter_two++)
            {
                temp = temp+a[counter*width_x+counter_two];
            }
        }
    }

    // Transfer the final result to the output array.
    b[gidy*width_x+gidx]=temp;
}

// Host driver as posted in the question: reads an m x n matrix from stdin,
// runs the kernel with one block per element and prints the result.
// NOTE(review): `void main` is non-standard C++; kept as posted.
void main()
{
    // m is the number of rows
    // n is the number of columns

    int m=3,n=2, m_e=0;
    int total_e=m*n;
    int widthstep=total_e*sizeof(int);

    int * matrix_a= (int *)malloc(widthstep);
    int * matrix_b= (int *)malloc(widthstep);

    cout<<"enter elements "<< m<<"x"<<n<<" matrix";

    for(int r=0;r<=m-1;r++)
    {
        for(int c=0; c<=n-1;c++)
        {
            cout<<"enter matrix element [ "<<c<<","<<r<<"]";
            cin>>m_e;
            // NOTE(review): the row stride used here is m (the row count), not
            // n (the column count). With m=3, n=2 the largest index reached is
            // 2*3+1 = 7, past the end of the 6-element buffers. This is the
            // defect the question is asking about; left as posted.
            matrix_a[r*m+c]=m_e;
            matrix_b[r*m+c]=0;
        }
    }

    int * d_matrix_a, * d_matrix_b;

    cout<<"input:"<<endl;

    for(int kk=0;kk<=m-1;kk++)
    {
        for(int jj=0;jj<=n-1;jj++){
            cout<<matrix_a[kk*m+jj]<<" ";}
        cout<<endl;
    }

    cout<<endl;

    cudaMalloc(&d_matrix_a,widthstep);
    cudaMalloc(&d_matrix_b,widthstep);

    cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);

    // Creating a grid where the number of blocks equals the number of pixels
    // or input matrix elements.

    // Each block contains only 1 thread.
    // NOTE(review): the grid is (m, n) and m (rows) is passed as width_x, so
    // the kernel indexes the data as n rows of m columns; left as posted.

    dim3 grid(m,n);

    image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,m,n);

    cudaThreadSynchronize();

    cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);

    cout<<"the summed area table is: "<<endl;

    for(int kk=0;kk<=m-1;kk++)
    {
        for(int jj=0;jj<=n-1;jj++)
            cout<<matrix_b[kk*m+jj]<<" ";
        cout<<endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
}

many thanks!!

Your main problem is wrong memory usage and storage. Your code also corrupted the heap! I changed your code to use row-major ordering, as it's normally used in C/C++.

Your first error occurs when you write the inputs into host memory with matrix_a[r*m+c]. Because r ranges over 0..m-1 (m = 3) and c ranges over 0..n-1 (n = 2), the maximum index is 2*3+1 = 7. But your matrix has only 6 elements, so the maximum valid index is 5! Therefore I changed all the matrix accesses.

With these changes you have to adapt the grid setup, too. It is now dim3 grid(n,m);.

If it's unclear what a variable represents or how to use it, give it a descriptive name, as you did in your C reference code!

With these changes the code works for me. Be aware that the way the matrix is entered has changed, too!

Below is the complete changed code. The kernel function:

/**
 * Computes the integral image (summed-area table) of a row-major matrix.
 *
 * Each thread owns one output element (row, col) and stores in
 * b[row * colstotal + col] the sum of a[r * colstotal + c] over all
 * r <= row and c <= col.
 *
 * Launch layout: a 2D launch whose x dimension covers the columns and whose
 * y dimension covers the rows, e.g. image_integral<<<dim3(cols, rows), 1>>>.
 * Threads that fall outside the matrix return without touching memory, so the
 * launch may be larger than the matrix. No shared memory is used.
 *
 * @param a          input matrix, rowstotal * colstotal ints, row-major
 * @param b          output matrix, same size and layout as a
 * @param rowstotal  number of rows
 * @param colstotal  number of columns (also the row stride of a and b)
 */
__global__ void image_integral(const int *a, int *b, int rowstotal, int colstotal)
{
    // With one thread per block (blockDim == 1, threadIdx == 0) these reduce
    // to the block ids; larger blocks are handled by the same formula.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Leave before the store: an out-of-range thread would otherwise write
    // past the end of b or overwrite another element's result.
    if (col >= colstotal || row >= rowstotal)
    {
        return;
    }

    int temp = 0;

    // Outer loop walks the rows from 0 up to this thread's row.
    for (int r = 0; r <= row; r++)
    {
        // Inner loop walks the columns from 0 up to this thread's column.
        for (int c = 0; c <= col; c++)
        {
            temp += a[r * colstotal + c];
        }
    }

    // Transfer the final result to the output array.
    b[row * colstotal + col] = temp;
}

the host implementation:

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Reads an m x n integer matrix from standard input, computes its summed-area
 * table on the GPU with image_integral (one single-thread block per element)
 * and prints both the input and the result.
 *
 * Host and device buffers are row-major with row stride n.
 *
 * @return 0 on success, 1 if allocation, input parsing or any CUDA call failed.
 */
int main()
{
    // m is the number of rows
    // n is the number of columns

    const int m = 3, n = 2;
    int m_e = 0;
    const int total_e = m * n;
    const size_t widthstep = total_e * sizeof(int);

    int *matrix_a = (int *)malloc(widthstep);
    int *matrix_b = (int *)malloc(widthstep);
    int *d_matrix_a = NULL, *d_matrix_b = NULL;

    bool ok = (matrix_a != NULL && matrix_b != NULL);
    if (!ok)
    {
        std::cerr << "host allocation failed" << std::endl;
    }

    if (ok)
    {
        std::cout << "enter elements " << m << "x" << n << " matrix";

        for (int r = 0; ok && r < m; r++)
        {
            for (int c = 0; ok && c < n; c++)
            {
                std::cout << "enter matrix element [ " << r << "," << c << "]";
                if (!(std::cin >> m_e))
                {
                    // Stop instead of filling the matrix with a stale value.
                    std::cerr << "invalid matrix element" << std::endl;
                    ok = false;
                    break;
                }
                matrix_a[r * n + c] = m_e;
                matrix_b[r * n + c] = 0;
            }
        }
    }

    if (ok)
    {
        std::cout << "input:" << std::endl;

        for (int r = 0; r < m; r++)
        {
            for (int c = 0; c < n; c++)
            {
                std::cout << matrix_a[r * n + c] << " ";
            }
            std::cout << std::endl;
        }

        std::cout << std::endl;
    }

    // Each step only runs if everything before it succeeded (short-circuit).
    ok = ok && cuda_ok(cudaMalloc(&d_matrix_a, widthstep), "cudaMalloc(d_matrix_a)");
    ok = ok && cuda_ok(cudaMalloc(&d_matrix_b, widthstep), "cudaMalloc(d_matrix_b)");

    ok = ok && cuda_ok(cudaMemcpy(d_matrix_a, matrix_a, widthstep, cudaMemcpyHostToDevice),
                       "cudaMemcpy(matrix_a -> device)");
    ok = ok && cuda_ok(cudaMemcpy(d_matrix_b, matrix_b, widthstep, cudaMemcpyHostToDevice),
                       "cudaMemcpy(matrix_b -> device)");

    if (ok)
    {
        // Create a grid with one block per matrix element: x spans the n
        // columns, y spans the m rows. Each block contains only 1 thread.
        dim3 grid(n, m);

        image_integral<<<grid, 1>>>(d_matrix_a, d_matrix_b, m, n);

        // A launch reports configuration errors through cudaGetLastError();
        // errors raised while the kernel runs surface at the synchronize.
        ok = cuda_ok(cudaGetLastError(), "image_integral launch") &&
             cuda_ok(cudaDeviceSynchronize(), "image_integral execution");
    }

    ok = ok && cuda_ok(cudaMemcpy(matrix_b, d_matrix_b, widthstep, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device -> matrix_b)");

    if (ok)
    {
        std::cout << "the summed area table is: " << std::endl;

        for (int r = 0; r < m; r++)
        {
            for (int c = 0; c < n; c++)
            {
                std::cout << matrix_b[r * n + c] << " ";
            }
            std::cout << std::endl;
        }
    }

    system("pause");

    // cudaFree and free both accept null pointers, so this is safe on every path.
    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);

    return ok ? 0 : 1;
}

Comments

Popular posts from this blog

c# - How to get the current UAC mode -

postgresql - Lazarus + Postgres: incomplete startup packet -

javascript - Ajax jqXHR.status==0 fix error -