Python Web Crawler Internship Report

Contents
一、Topic Selection Background
二、Principles of Web Crawlers
三、History and Classification of Web Crawlers
四、Comparison of Common Crawler Frameworks
五、Hands-on Data Scraping (Movie Data from Douban)
    1 Analyzing the web page
    2 Scraping the data
    3 Cleaning and converting the data
    4 Saving and displaying the data
    5 Technical difficulties and key points
六、Summary

一、Topic Selection Background

二、Principles of Web Crawlers

三、History and Classification of Web Crawlers

四、Comparison of Common Crawler Frameworks

Scrapy: Scrapy is a relatively mature Python crawler framework. It is a fast, high-level scraping framework written in Python that can crawl web pages efficiently and extract structured data from them. Scrapy is used widely, for crawler development, data mining, data monitoring, automated testing and more.

Crawley: Crawley is another crawler framework developed in Python; it aims to change the way people extract data from the Internet.

Portia: Portia is a crawler framework that lets users with no programming background scrape web pages visually.

newspaper: newspaper is a Python crawler framework for extracting news and articles and for content analysis.

Python-goose: the information Python-goose can extract includes (1) the main body of the article, (2) the article's main image, (3) any YouTube/Vimeo videos embedded in the article, (4) the meta description and (5) the meta tags.

五、Hands-on Data Scraping (Movie Data from Douban)

1 Analyzing the web page

import urllib.request
import datetime
from bs4 import BeautifulSoup

# Fetch the HTML source of every listing page
def __getHtml():
    data = []
    pageNum = 1
    pageSize = 0
    try:
        while pageSize <= 125:
            # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            #            'Referer': None  # if pages still cannot be fetched, also set the target site's host here
            #            }
            # opener = urllib.request.build_opener()
            # opener.addheaders = [headers]
            url = "https://movie.douban.com/top250?start=" + str(pageSize) + "&filter=" + str(pageNum)
            # data['html%s' % i] = urllib.request.urlopen(url).read().decode("utf-8")
            data.append(urllib.request.urlopen(url).read().decode("utf-8"))
            pageSize += 25
            pageNum += 1
            print(pageSize, pageNum)
    except Exception as e:
        raise e
    return data

2 Scraping the data

# Parse one page of HTML and collect the fields of interest
def __getData(html):
    title = []                 # movie titles
    # rating_num = []          # ratings
    range_num = []             # ranks
    # rating_people_num = []   # number of raters
    movie_author = []          # directors
    data = {}
    # parse the HTML with bs4
    soup = BeautifulSoup(html, "html.parser")
    for li in soup.find("ol", attrs={'class': 'grid_view'}).find_all("li"):
        title.append(li.find("span", class_="title").text)
        # rating_num.append(li.find("div", class_='star').find("span", class_='rating_num').text)
        range_num.append(li.find("div", class_='pic').find("em").text)
        # spans = li.find("div", class_='star').find_all("span")
        # for x in range(len(spans)):
        #     if x <= 2:
        #         pass
        #     else:
        #         rating_people_num.append(spans[x].string[-len(spans[x].string):-3])
        str = li.find("div", class_='bd').find("p", class_='').text.lstrip()
        index = str.find("主")
        if index == -1:
            index = str.find("...")
        print(li.find("div", class_='pic').find("em").text)
        if li.find("div", class_='pic').find("em").text == "210":
            index = 60
            # print(aaa)
        # print(str[4:index])
        movie_author.append(str[4:index])
    data['title'] = title
    # data['rating_num'] = rating_num
    data['range_num'] = range_num
    # data['rating_people_num'] = rating_people_num
    data['movie_author'] = movie_author
    return data

3 Cleaning and converting the data

# Write the collected data into an HTML report
def __getMovies(datas):
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # timestamp shown in the report header
    f = open('F://douban_movie.html', 'w', encoding='utf-8')
    f.write("<html>")
    f.write("<head><meta charset='UTF-8'><title>Insert title here</title></head>")
    f.write("<body>")
    f.write("<h1>爬取豆瓣电影</h1>")
    f.write("<h4>作者:刘文斌</h4>")
    f.write("<h4>时间:" + nowtime + "</h4>")
    f.write("<hr>")
    f.write("<table width='800px' border='1' align=center>")
    f.write("<thead>")
    f.write("<tr>")
    f.write("<th><font size='5' color=green>电影</font></th>")
    # f.write("<th width='50px'><font size='5' color=green>评分</font></th>")
    f.write("<th width='50px'><font size='5' color=green>排名</font></th>")
    # f.write("<th width='100px'><font size='5' color=green>评价人数</font></th>")
    f.write("<th><font size='5' color=green>导演</font></th>")
    f.write("</tr>")
    f.write("</thead>")
    f.write("<tbody>")
    for data in datas:
        for i in range(0, 25):
            f.write("<tr>")
            f.write("<td style='color:orange;text-align:center'>%s</td>" % data['title'][i])
            # f.write("<td style='color:blue;text-align:center'>%s</td>" % data['rating_num'][i])
            f.write("<td style='color:red;text-align:center'>%s</td>" % data['range_num'][i])
            # f.write("<td style='color:blue;text-align:center'>%s</td>" % data['rating_people_num'][i])
            f.write("<td style='color:black;text-align:center'>%s</td>" % data['movie_author'][i])
            f.write("</tr>")
    f.write("</tbody>")
    f.write("</table>")
    f.write("</body>")
    f.write("</html>")
    f.close()

if __name__ == '__main__':
    datas = []
    htmls = __getHtml()
    for i in range(len(htmls)):
        data = __getData(htmls[i])
        datas.append(data)
    __getMovies(datas)
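Before moving on to saving and display, note that the list of per-page dictionaries built above does not have to be rendered as HTML. The short helper below is only an illustrative sketch and is not part of the original report: it assumes the datas list and the key names ('title', 'range_num', 'movie_author') produced by __getData, and writes the same fields to a CSV file that can be opened in Excel or loaded with pandas.

import csv

# Hypothetical helper (not in the original report): dump the scraped fields to CSV.
def save_movies_csv(datas, path='F://douban_movie.csv'):
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'rank', 'director'])  # column headers
        for data in datas:
            for i in range(len(data['title'])):
                writer.writerow([data['title'][i],
                                 data['range_num'][i],
                                 data['movie_author'][i]])

Called as save_movies_csv(datas) right after the loop in the __main__ block, it would produce one row per movie across all fetched pages.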
4 Saving and displaying the data

The results are shown in the screenshots that follow.

5 Technical difficulties and key points

Hands-on data scraping (housing data from Soufun):

from bs4 import BeautifulSoup
import requests

rep = requests.get('...')  # Soufun (fang.com) new-house listing page
rep.encoding = 'gb2312'    # set the encoding
html = rep.text
soup = BeautifulSoup(html, 'html.parser')

f = open('F://fang.html', 'w', encoding='utf-8')
f.write(html)
f.write("<head><meta charset='UTF-8'><title>Insert title here</title></head>")
f.write("<body>")
f.write("<center><h1>新房成交TOP3</h1></center>")
f.write("<table border='1px' width='1000px' height='800px' align=center><tr>")
f.write("<th><h2>房址</h2></th>")
f.write("<th><h2>成交量</h2></th>")
f.write("<th><h2>均价</h2></th></tr>")
for li in soup.find("ul", class_="ul02").find_all("li"):
    name = li.find("div", class_="pbtext").find("p").text
    chengjiaoliang = li.find("span", class_="red-f3").text
    try:
        junjia = li.find("div", class_="ohter").find("p", class_="gray-9")  # .text.replace('㎡', '平方米')
    except Exception as e:
        junjia = li.find("div", class_="gray-9")  # .text.replace('㎡', '平方米')
    f.write("<tr><td align=center><font size='5px' color=red>%s</font></td>" % name)
    f.write("<td align=center><font size='5px' color=blue>%s</font></td>" % chengjiaoliang)
    f.write("<td align=center><font size='5px' color=green>%s</font></td></tr>" % junjia)
f.write("</table></body></html>")
f.close()
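One key point that the commented-out code in __getHtml already hints at is that some sites refuse requests that do not look like they come from a browser. The fragment below is a small illustrative sketch, not part of the original report: it reuses the User-Agent string from that commented-out code and passes it through requests; the URL is only an example.

import requests

# Illustrative sketch: send a browser-like User-Agent so simple blocks are less likely.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
                   '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    # some sites also check the Referer or Host headers
}
rep = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
rep.encoding = 'utf-8'
print(rep.status_code, len(rep.text))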