Integral Image or Summed Area Table of 2D matrix using CUDA C -
I am trying to compute the summed area table of a 2D matrix whose number of rows and number of columns are not equal. I have run into a slight problem: the code seems to work when the rows and columns are equal, but it fails to compute the last row of the final output when the rows and columns are not equal. The problem is that I can't figure out why this is happening.
The basic algorithm for the integral image / summed area table:
Basically, in an integral sum every pixel (or index element) holds the sum of all the matrix elements above it and to the left of it, including itself. For instance, for a 3x2 input array with the following elements:
[5, 2]
[5, 2]
[5, 2]
the integral sum in the output array is:
[ 5,  7]
[10, 14]
[15, 21]
Basically, the following is what I am trying to do in CUDA C:
// Sequential reference for the summed area table: every output element is the
// sum of all input elements whose row is <= its row and whose column is <= its
// column (the element itself included).
// matrixelement_x_index and matrixelement_y_index are the (x, y) indices, i.e.
// the (column, row), of the output element being computed.
for (int matrixelement_y_index = 0; matrixelement_y_index <= total_rows - 1; matrixelement_y_index++)
{
    for (int matrixelement_x_index = 0; matrixelement_x_index <= total_columns - 1; matrixelement_x_index++)
    {
        int temp = 0;
        for (int r = 0; r <= matrixelement_y_index; r++)
        {
            for (int c = 0; c <= matrixelement_x_index; c++)
            {
                // Row index first, column second -- the same order used for
                // `output` below. Indexing as [c][r] would transpose the read
                // and run past the row length when rows != columns.
                temp = temp + input[r][c];
            }
        }
        output[matrixelement_y_index][matrixelement_x_index] = temp;
    }
}
The CUDA C code I have come up with so far is as follows:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
using namespace std;

// NOTE(review): this is the listing from the question. Its indexing is kept
// exactly as posted, because it is the subject of the discussion that follows:
//   * the host buffers are indexed with r*m+c (stride m = number of rows)
//     although each row holds only n elements;
//   * the grid is created as dim3 grid(m,n) and the kernel receives m as
//     width_x and n as width_y.
// With m=3, n=2 the host index r*m+c reaches 2*3+1 = 7 while only 6 elements
// are allocated.

// Computes one element of the summed area table per block.
//   a        input matrix (device memory)
//   b        output matrix (device memory)
//   width_x  extent checked against blockIdx.x, also used as the row stride
//   width_y  extent checked against blockIdx.y
__global__ void image_integral(int *a, int *b, int width_x, int width_y)
{
    // Thread ids are equal to block ids because each block contains one thread only.
    int gidx = blockIdx.x;
    int gidy = blockIdx.y;
    int temp = 0;

    if (gidx >= width_x || gidy >= width_y)
    {
        // Return for threads that exceed the input array's x or y dimension.
        return;
    }
    else // compute the integral image / summed area table
    {
        // The first loop iterates from 0 to the y index of the thread, which
        // represents the corresponding element of the output/input array.
        for (int counter = 0; counter <= gidy; counter++)
        {
            // The second loop iterates from 0 to the x index of the thread, which
            // represents the corresponding element of the output/input array.
            for (int counter_two = 0; counter_two <= gidx; counter_two++)
            {
                temp = temp + a[counter * width_x + counter_two];
            }
        }
    }

    // Transfer the final result to the output array.
    b[gidy * width_x + gidx] = temp;
}

int main()
{
    // m is the number of rows
    // n is the number of columns
    int m = 3, n = 2, m_e = 0;
    int total_e = m * n;
    int widthstep = total_e * sizeof(int);

    int *matrix_a = (int *)malloc(widthstep);
    int *matrix_b = (int *)malloc(widthstep);

    cout << "enter elements " << m << "x" << n << " matrix";
    for (int r = 0; r <= m - 1; r++)
    {
        for (int c = 0; c <= n - 1; c++)
        {
            cout << "enter matrix element [ " << c << "," << r << "]";
            cin >> m_e;
            matrix_a[r * m + c] = m_e; // NOTE(review): stride m, see header note
            matrix_b[r * m + c] = 0;
        }
    }

    int *d_matrix_a, *d_matrix_b;

    cout << "input:" << endl;
    for (int kk = 0; kk <= m - 1; kk++)
    {
        for (int jj = 0; jj <= n - 1; jj++)
        {
            cout << matrix_a[kk * m + jj] << " ";
        }
        cout << endl;
    }
    cout << endl;

    cudaMalloc(&d_matrix_a, widthstep);
    cudaMalloc(&d_matrix_b, widthstep);

    cudaMemcpy(d_matrix_a, matrix_a, widthstep, cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b, matrix_b, widthstep, cudaMemcpyHostToDevice);

    // Create a grid whose number of blocks equals the number of pixels
    // (input matrix elements). Each block contains one thread.
    dim3 grid(m, n); // NOTE(review): (rows, columns) order, see header note
    image_integral<<<grid, 1>>>(d_matrix_a, d_matrix_b, m, n);
    cudaThreadSynchronize();

    cudaMemcpy(matrix_b, d_matrix_b, widthstep, cudaMemcpyDeviceToHost);

    cout << "the summed area table is: " << endl;
    for (int kk = 0; kk <= m - 1; kk++)
    {
        for (int jj = 0; jj <= n - 1; jj++)
            cout << matrix_b[kk * m + jj] << " ";
        cout << endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
    return 0;
}
many thanks!!
Your main problem is incorrect memory usage and storage: your code corrupts the heap! I changed your code to use row-major ordering, as is conventional in C/C++.
Your first error occurs when you write the inputs into host memory at matrix_a[r*m+c]. Because r ranges over 0..m-1 (m = 3) and c ranges over 0..n-1 (n = 2), the maximum index is 2*3+1 = 7. But your matrix has only 6 elements, so the maximum valid index is 5! Therefore I changed all the matrix accesses.
With these changes you have to adapt the grid setup, too. It is now dim3 grid(n,m);.
If it is unclear what a variable represents or how to use it, give it a descriptive name, as you did in your C reference code!
With these changes the code works for me. Be aware that the way the matrix is entered has changed, too!
Here is the complete changed code. The kernel function:
/**
 * Computes the summed area table (integral image) of a row-major matrix.
 *
 * Each thread produces one output element: the sum of every input element
 * whose row is <= the thread's row and whose column is <= the thread's column.
 *
 * Launch layout: a 2D grid in which x covers the columns and y covers the
 * rows. The original launch (one thread per block, dim3 grid(colstotal,
 * rowstotal)) is still valid; larger blocks work as well because the index is
 * derived from blockIdx, blockDim and threadIdx.
 * No shared memory is used.
 *
 * @param a          input matrix, rowstotal x colstotal, row-major
 * @param b          output matrix, same size and layout as a
 * @param rowstotal  number of rows
 * @param colstotal  number of columns (also the row stride)
 */
__global__ void image_integral(const int *a, int *b, int rowstotal, int colstotal)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the matrix must neither read nor write: the
    // store below would otherwise land outside the output buffer.
    if (col >= colstotal || row >= rowstotal)
    {
        return;
    }

    int temp = 0;
    // Rows 0..row of the input.
    for (int r = 0; r <= row; r++)
    {
        // Columns 0..col of the input.
        for (int c = 0; c <= col; c++)
        {
            temp += a[r * colstotal + c];
        }
    }

    // Transfer the final result to the output array.
    b[row * colstotal + col] = temp;
}
The host implementation:
// Prints a diagnostic when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Reads an m x n matrix from standard input (row by row), computes its summed
// area table on the GPU with image_integral and prints input and result.
// Returns 0 on success and 1 when allocation, input or any CUDA call fails.
int main()
{
    // m is the number of rows
    // n is the number of columns
    const int m = 3, n = 2;
    int m_e = 0;
    const int total_e = m * n;
    const size_t widthstep = total_e * sizeof(int);

    int *matrix_a = (int *)malloc(widthstep);
    int *matrix_b = (int *)malloc(widthstep);
    if (matrix_a == 0 || matrix_b == 0)
    {
        std::cerr << "host allocation failed" << std::endl;
        free(matrix_a);
        free(matrix_b);
        return 1;
    }

    std::cout << "enter elements " << m << "x" << n << " matrix";
    for (int r = 0; r < m; r++)
    {
        for (int c = 0; c < n; c++)
        {
            std::cout << "enter matrix element [ " << r << "," << c << "]";
            if (!(std::cin >> m_e))
            {
                std::cerr << "invalid matrix element" << std::endl;
                free(matrix_a);
                free(matrix_b);
                return 1;
            }
            // Row-major storage: the row stride is the number of columns.
            matrix_a[r * n + c] = m_e;
            matrix_b[r * n + c] = 0;
        }
    }

    std::cout << "input:" << std::endl;
    for (int r = 0; r < m; r++)
    {
        for (int c = 0; c < n; c++)
        {
            std::cout << matrix_a[r * n + c] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;

    int *d_matrix_a = 0;
    int *d_matrix_b = 0;

    // The && chain stops at the first failing call, so later steps never run
    // on a device that is already in an error state.
    bool ok = cudaSucceeded(cudaMalloc(&d_matrix_a, widthstep), "cudaMalloc(d_matrix_a)")
           && cudaSucceeded(cudaMalloc(&d_matrix_b, widthstep), "cudaMalloc(d_matrix_b)")
           && cudaSucceeded(cudaMemcpy(d_matrix_a, matrix_a, widthstep, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device, matrix_a)")
           && cudaSucceeded(cudaMemcpy(d_matrix_b, matrix_b, widthstep, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device, matrix_b)");

    if (ok)
    {
        // Create a grid whose number of blocks equals the number of matrix
        // elements: x covers the n columns, y covers the m rows.
        // Each block contains one thread.
        dim3 grid(n, m);
        image_integral<<<grid, 1>>>(d_matrix_a, d_matrix_b, m, n);

        // A launch does not return a status itself: configuration errors are
        // reported by cudaGetLastError, execution errors by the synchronize.
        ok = cudaSucceeded(cudaGetLastError(), "image_integral launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "image_integral execution")
          && cudaSucceeded(cudaMemcpy(matrix_b, d_matrix_b, widthstep, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host, matrix_b)");
    }

    if (ok)
    {
        std::cout << "the summed area table is: " << std::endl;
        for (int r = 0; r < m; r++)
        {
            for (int c = 0; c < n; c++)
            {
                std::cout << matrix_b[r * n + c] << " ";
            }
            std::cout << std::endl;
        }
    }

    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");

    // cudaFree and free both accept a null pointer, so this is safe on every path.
    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);

    return ok ? 0 : 1;
}
Comments
Post a Comment