程序性能调优

实验题目：程序性能调优

实验要求：本次实验要求针对每个函数、每个人均至少写出 3 种优化版本，并根据 driver 报告的结果进行性能分析。

实验目的：理解编译器，学习程序优化，从优化程序代码和程序执行速度两方面着手。

实验环境：Win7 64 位、Ubuntu、VMware Workstation

实验内容及操作步骤：将下载下来的 kernels.c 中的 rotate、smooth 函数进行优化。本实验的实验原理是通过循环展开、cache 友好、替换变量等手段来实现程序优化。

实验过程及分析：由于优化代码较长，就不进行截图。

1. Naive_rotate

1）原代码

char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int i, j;
    for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
            dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

2）分析：这段代码的作用就是将所有的像素进行行列调位，导致整幅图画进行了 90 度旋转。从 defs.h 中可以找到 #define RIDX(i,j,n) ((i)*(n)+(j))。这段代码本来很短，但是从 cache 友好性来分析，这个代码的效率会很低，所以按照 cache 的大小，应在存储的时候进行 32 个像素依次存储（列存储）。做到 cache 友好，这样就可以大幅度提高效率。

#include <stdio.h>
#include <stdlib.h>
#include "defs.h"

team_t team = {
    "201308060228",      /*队名*/
    "201308060228",      /*序号*/
    "747660816@qq.com",  /*邮箱*/
    "",                  /* Second member full name (leave blank if none) */
    ""                   /* Second member email addr (leave blank if none) */
};

/*
 * naive_rotate - The naive baseline version of rotate
 */
char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int i, j;
    for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
            dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

/*
 * rotate - Your current working version of rotate
 * IMPORTANT: This is the version you will be graded on
 */
char rotate_descr[] = "rotate: Current working version, using pointer rather than computing address";
void rotate(int dim, pixel *src, pixel *dst)
{
    int i;
    int j;
    int tmp1 = dim * dim;
    int tmp2 = dim * 31;
    int tmp3 = tmp1 - dim;
    int tmp4 = tmp1 + 32;
    int tmp5 = dim + 31;
    dst += tmp3;
    for (i = 0; i < dim; i += 32) {
        for (j = 0; j < dim; j++) {
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src; dst++; src += dim;
            *dst = *src;
            src++;
            src -= tmp2;
            dst -= tmp5;
        }
        src += tmp2;
        dst += tmp4;
    }
}

/*********************************************************************
 * register_rotate_functions - Register all of your different versions
 *     of the rotate kernel with the driver by calling the
 *     add_rotate_function() for each test function. When you run the
 *     driver program, it will test and report the performance of each
 *     registered test function.
 *********************************************************************/

char rotate_descr_v1[] = "rotate_v1: version 1 break into 4*4 blocks";
void rotate_v1(int dim, pixel *src, pixel *dst)
{
    int i, j, ii, jj;
    for (ii = 0; ii < dim; ii += 4)
        for (jj = 0; jj < dim; jj += 4)
            for (i = ii; i < ii + 4; i++)
                for (j = jj; j < jj + 4; j++)
                    dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

char rotate_descr_v2[] = "rotate_v2: version 2 break into 32*32 blocks";
void rotate_v2(int dim, pixel *src, pixel *dst)
{
    int i, j, ii, jj;
    for (ii = 0; ii < dim; ii += 32)
        for (jj = 0; jj < dim; jj += 32)
            for (i = ii; i < ii + 32; i++)
                for (j = jj; j < jj + 32; j++)
                    dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

char rotate_descr_v3[] = "rotate_v3: version 3 break into 4*1 blocks with 4 parallel paths";
void rotate_v3(int dim, pixel *src, pixel *dst)
{
    int i;
    int j;
    int tmp = (dim - 1) * dim;
    pixel *src_op;
    pixel *dst_op;
    for (i = 0; i < dim; i += 4) {
        pixel *src_op_cpy = src + i * dim;
        pixel *dst_op_cpy = dst + tmp + i;
        src_op = src_op_cpy;
        dst_op = dst_op_cpy;
        for (j = 0; j < dim; j++) {
            *dst_op = *src_op; dst_op++; src_op += dim;
            *dst_op = *src_op; dst_op++; src_op += dim;
            *dst_op = *src_op; dst_op++; src_op += dim;
            *dst_op = *src_op; dst_op++; src_op += dim;
            src_op_cpy++;
            dst_op_cpy -= dim;
            src_op = src_op_cpy;
            dst_op = dst_op_cpy;
        }
    }
}

2. Naive_smooth

1）原代码

char naive_smooth_descr[] = "naive_smooth: Naive baseline implementation";
void naive_smooth(int dim, pixel *src, pixel *dst)
{
    int i, j;
    for (i = 0; i < dim; i++)
        for (j = 0; j < dim; j++)
            dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}

2）分析：这段代码很多次地调用 avg 函数，而 avg 函数内也频繁调用 initialize_pixel_sum、accumulate_sum、assign_sum_to_pixel 这几个函数，且含有 2 层 for 循环。虽然会以损害程序的模块性为代价，但消除函数调用的时间开销，得到的代码运行速度会快得多。所以，需要改写代码，不调用 avg 函数。

Smooth 函数处理分为以下 3 部分：
一．主体内部，由 9 点求平均值；
二．4 个角，由 4 点求平均值；
三．4 条边界，由 6 点求平均值。
由图片的顶部开始处理，再上边界，顺序处理下来，其中在处理左边界时，for 循环处理一行主体部分。

3）优化代码

char smooth_descr_v1[] = "smooth_v1: with less func call and grossly simplified calculation for central parts";
void smooth_v1(int dim, pixel *src, pixel *dst)
{
    int i, j, ii, jj;
    pixel_sum sum;
    pixel current_pixel, cp;
    for (j = 0; j < dim; j++) {
        dst[RIDX(0, j, dim)] = avg(dim, 0, j, src);
        dst[RIDX(dim-1, j, dim)] = avg(dim, dim-1, j, src);
    }
    for (i = 0; i < dim; i++) {
        dst[RIDX(i, 0, dim)] = avg(dim, i, 0, src);
        dst[RIDX(i, dim-1, dim)] = avg(dim, i, dim-1, src);
    }
    for (i = 1; i < dim-1; i++)
        for (j = 1; j < dim-1; j++) {
            sum.red = sum.green = sum.blue = 0;
            for (ii = max(i-1, 0); ii <= min(i+1, dim-1); ii++)
                for (jj = max(j-1, 0); jj <= min(j+1, dim-1); jj++) {
                    cp = src[RIDX(ii, jj, dim)];
                    sum.red += cp.red;
                    sum.green += cp.green;
                    sum.blue += cp.blue;
                }
            current_pixel.red = sum.red / 9;
            current_pixel.green = sum.green / 9;
            current_pixel.blue = sum.blue / 9;
            dst[RIDX(i, j, dim)] = current_pixel;
        }
}

char smooth_descr_v2[] = "smooth_v2: divide src into 3 parts and use 3 pointers for smoothing c
（原文预览在此处截断）

程序性能调优

免费阅读已结束，点击付费阅读剩下 ... 页

阅读已结束，您可以下载文档离线阅读

第六章产品

MBA全景教程之十__管理信息系统

第5章XP的管理工具与系统结构

组织与人力资源-0304

甘肃省行政执法监督条例

高级会计职称精要考点总结

XXXX年3月、6月、9月、11月XXXX年3月《证券市场基础知

财务会计答案

去离子水生产记录

第五章-拉丁方设计

相关文档

相关搜索