您好,欢迎访问三七文档
实验题目:程序性能调优。实验要求:本次实验,要求针对每个函数、每个人均至少写出3种优化版本,并根据driver报告的结果进行性能分析。实验目的:理解编译器,学习程序优化,从优化程序代码和程序执行速度两方面着手。实验环境:WIN7 64位、Ubuntu、VMware Workstation。实验内容及操作步骤:将下载下来的kernels.c中的rotate、smooth函数进行优化。本实验的实验原理是通过循环展开、cache友好、替换变量等手段来实现程序优化。实验过程及分析:由于优化代码较长,就不进行截图。1.Naive_rotate 1)原代码charnaive_rotate_descr[]=naive_rotate:Naivebaselineimplementation;voidnaive_rotate(intdim,pixel*src,pixel*dst){inti,j;for(i=0;idim;i++)for(j=0;jdim;j++)dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)];}2)分析:这段代码的作用就是将所有的像素进行行列调位,导致整幅图画进行了90度旋转。从defs.h中可以找到#defineRIDX(i,j,n)((i)*(n)+(j))。这段代码本来很短,但是从cache友好性来分析,这个代码的效率会很低,所以按照cache的大小,应在存储的时候进行32个像素依次存储(列存储)。做到cache友好,这样就可以大幅度提高效率。#includestdio.h#includestdlib.h#includedefs.hteam_tteam={201308060228,/*队名*/201308060228,/*序号*/747660816@qq.com,/*邮箱*/,/*Secondmemberfullname(leaveblankifnone)*//*Secondmemberemailaddr(leaveblankifnone)*/};/**naive_rotate-Thenaivebaselineversionofrotate*/charnaive_rotate_descr[]=naive_rotate:Naivebaselineimplementation;voidnaive_rotate(intdim,pixel*src,pixel*dst){inti,j;for(i=0;idim;i++)for(j=0;jdim;j++)dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)];}/**rotate-Yourcurrentworkingversionofrotate*IMPORTANT:Thisistheversionyouwillbegradedon*/charrotate_descr[]=rotate:Currentworkingversion,usingpointerratherthancomputingaddress;voidrotate(intdim,pixel*src,pixel*dst){inti;intj;inttmp1=dim*dim;inttmp2=dim*31;inttmp3=tmp1-dim;inttmp4=tmp1+32;inttmp5=dim+31;dst+=tmp3;for(i=0;idim;i+=32){for(j=0;jdim;j++){*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+
=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;dst++;src+=dim;*dst=*src;src++;src-=tmp2;dst-=tmp5;}src+=tmp2;dst+=tmp4;}}/**********************************************************************register_rotate_functions-Registerallofyourdifferentversions*oftherotatekernelwiththedriverbycallingthe*add_rotate_function()foreachtestfunction.Whenyourunthe*driverprogram,itwilltestandreporttheperformanceofeach*registeredtestfunction.*********************************************************************/charrotate_descr_v1[]=rotate_v1:version1breakinto4*4blocks;voidrotate_v1(intdim,pixel*src,pixel*dst){inti,j,ii,jj;for(ii=0;iidim;ii+=4)for(jj=0;jjdim;jj+=4)for(i=ii;iii+4;i++)for(j=jj;jjj+4;j++)dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)];}charrotate_descr_v2[]=rotate_v2:version2breakinto32*32blocks;voidrotate_v2(intdim,pixel*src,pixel*dst){inti,j,ii,jj;for(ii=0;iidim;ii+=32)for(jj=0;jjdim;jj+=32)for(i=ii;iii+32;i++)for(j=jj;jjj+32;j++)dst[RIDX(dim-1-j,i,dim)]=src[RIDX(i,j,dim)];}charrotate_descr_v3[]=rotate_v3:version3breakinto4*1blockswith4parallelpaths;voidrotate_v3(intdim,pixel*src,pixel*dst){inti;intj;inttmp=(dim-1)*dim;pixel*src_op;pixel*dst_op;for(i=0;idim;i+=4){pixel*src_op_cpy=src+i*dim;pixel*dst_op_cpy=dst+tmp+i;src_op=src_op_cpy;dst_op=dst_op_cpy;for(j=0;jdim;j++){*dst_op=*src_op;dst_op++;src_op+=dim;*dst_op=*src_op;dst_op++;src_op+=dim;*dst_op=*src_op;dst_op++;src_op+=dim;*dst_op=*src_op;dst_op++;src_op+=dim;src_op_cpy++;dst_op_cpy-=dim;src_op=src_op_cpy;dst_op=dst_op_cpy;}}}2.Naive_smooth1)原代码charnaive_smooth_descr[]=naive_smooth:Naivebaselineimplementation;voidnaive_smooth(intdim,pixel*src,pixel*dst){inti,j;for(i=0;idim;i++)for(j=0;jdim;j++)dst[RIDX(i,j,dim)]=avg(dim,i,j,src);}2)分析这段代码很多次地调用avg函数,而avg函数内也频繁调用initialize_pixel_sum、accumulate_sum、assign_sum_to_pixel这几个函数,且含有2层for循环。虽然会以损害程序的模块性为代价,但消除函数
调用的时间开销,得到的代码运行速度会快得多。所以,需要改写代码,不调用avg函数。Smooth函数处理分为以下3部分,一.主体内部,由9点求平均值;二.4个角,由4点求平均值;三.4条边界,由6点求平均值。由图片的顶部开始处理,再上边界,顺序处理下来,其中在处理左边界时,for循环处理一行主体部分3)优化代码charsmooth_descr_v1[]=smooth_v1:withlessfunccallandgrosslysimplifiedcalculationforcentralparts;voidsmooth_v1(intdim,pixel*src,pixel*dst){inti,j,ii,jj;pixel_sumsum;pixelcurrent_pixel,cp;for(j=0;jdim;j++){dst[RIDX(0,j,dim)]=avg(dim,0,j,src);dst[RIDX(dim-1,j,dim)]=avg(dim,dim-1,j,src);}for(i=0;idim;i++){dst[RIDX(i,0,dim)]=avg(dim,i,0,src);dst[RIDX(i,dim-1,dim)]=avg(dim,i,dim-1,src);}for(i=1;idim-1;i++)for(j=1;jdim-1;j++){sum.red=sum.green=sum.blue=0;for(ii=max(i-1,0);ii=min(i+1,dim-1);ii++)for(jj=max(j-1,0);jj=min(j+1,dim-1);jj++){cp=src[RIDX(ii,jj,dim)];sum.red+=cp.red;sum.green+=cp.green;sum.blue+=cp.blue;}current_pixel.red=sum.red/9;current_pixel.green=sum.green/9;current_pixel.blue=sum.blue/9;dst[RIDX(i,j,dim)]=current_pixel;}}charsmooth_descr_v2[]=smooth_v2:dividesrcinto3partsanduse3pointersforsmoothingc
本文标题:程序性能调优
链接地址:https://www.777doc.com/doc-2150904.html