您好,欢迎访问三七文档
当前位置:首页 > 商业/管理/HR > 招聘面试 > JAVA实现爬取指定网站的数据
package zy.crawl.hupu;

import java.io.IOException;

import zy.crawl.common.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls a news list page and, for every headline link found on it, the
 * linked article page.
 *
 * <p>From the article this listing accompanies: this class parses the content
 * of a web site. The key part is the CSS selector path, for example
 * {@code div#page div#content div#local div#recommend ul li a}. Use Firefox's
 * Firebug add-on to inspect the page structure; the path differs from page to
 * page.
 *
 * <p>NOTE(review): the published listing had its whitespace, string quotes,
 * generic brackets and URL literals stripped. Selectors are reconstructed here
 * with descendant (space) combinators; if the original used {@code >} child
 * combinators those were lost too -- verify against the target page.
 */
public class CrawlHupu
{
    /**
     * URL of the list page to crawl.
     * TODO: the URL literal was stripped from the published listing; fill it in
     * (or pass the URL to {@link #ParseHtmlForNewsList(String)} / as the first
     * command-line argument).
     */
    private static final String NEWS_LIST_URL = "";

    /** Connection timeout in milliseconds. */
    private static final int CONNECTION_TIMEOUT_MS = 20000;

    /** Stores the crawled news objects. */
    private List<NewsInfo> newsList = new ArrayList<NewsInfo>();

    /**
     * Downloads a page with Apache HttpClient.
     *
     * @param url address of the page to fetch
     * @return the response body, or {@code null} when the request fails, the
     *         status is not 200, or the response has no entity
     */
    public String GetHtml(String url)
    {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();
        // Set a proxy if the network requires one:
        // HttpHost proxy = new HttpHost("10.68.120.11", 3128);
        // httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

        // Configure the connection timeout.
        httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECTION_TIMEOUT_MS);

        try
        {
            // Built inside the try so that a malformed URL is reported like any
            // other failure and the client is still shut down in finally.
            HttpGet httpGet = new HttpGet(url);
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int resStatu = httpResponse.getStatusLine().getStatusCode();
            if (resStatu == HttpStatus.SC_OK)
            {
                HttpEntity entity = httpResponse.getEntity();
                if (entity != null)
                {
                    html = EntityUtils.toString(entity);
                }
            }
        }
        catch (Exception e)
        {
            System.out.println("Connect " + url + " error");
            e.printStackTrace();
        }
        finally
        {
            httpClient.getConnectionManager().shutdown();
        }

        return html;
    }

    /**
     * Crawls the list page configured in {@code NEWS_LIST_URL}.
     */
    public void ParseHtmlForNewsList()
    {
        ParseHtmlForNewsList(NEWS_LIST_URL);
    }

    /**
     * Crawls the given list page: selects every headline link, fetches the
     * linked article, stores the result in the news list and prints it.
     *
     * @param listUrl address of the list page; also used as the base URI for
     *                resolving relative links
     */
    public void ParseHtmlForNewsList(String listUrl)
    {
        if (listUrl == null || listUrl.isEmpty())
        {
            System.out.println("No list page URL configured");
            return;
        }

        String html = GetHtml(listUrl);

        // For the first "hupu voice" selector one CSS class can be dropped for
        // now, so the space inside the class attribute need not be handled:
        // "div.content div.row div.column div.row div.column div.uibox div.uibox-con ul.ui-list li a"
        //
        // Selector that picks the title link of every list entry.
        String cssQueryHupu = "div#mainbody div.cjkx_mtsd div.cjkx ul.list_left li a";
        // Selectors used for other sites:
        // huxiu: "div.container-hx div.row-fluid-wrap-hx div.center-container-hx div.clearfix"
        //        + " div.center-ctr-wrap div.center-ctr-box div.article-list div.article-box div.article-box-ctt h4 a"
        // iteye: "div#page div#content div#local div#recommend ul li a"

        // GetHtml returns null on any failure, so check that before isEmpty().
        if (html == null || html.isEmpty())
        {
            return;
        }

        // The base URI lets absUrl("href") resolve relative links.
        Document doc = Jsoup.parse(html, listUrl);
        Elements linkElements = doc.select(cssQueryHupu);
        /*
         * <a class="button read" href="...">...</a>
         * Testing showed that a class attribute containing a space can be
         * matched with two chained selects:
         *   doc.select(".button").select(".read");
         *
         * Alternative for the hupu voice page:
         *   doc.select("div.hp-wrap").select("div.index-wrap div.col-B div.voice-main div.public"
         *       + " div#J_public_item ul li dl.item-bd dt span a");
         */
        for (Element ele : linkElements)
        {
            // ele.attr("abs:href") would also yield the absolute address.
            NewsInfo newsTemp = new NewsInfo(ele.text(), ele.absUrl("href"));

            PaserHtmlForNewsContent(newsTemp.getHtmlAddr(), newsTemp);
            newsList.add(newsTemp);

            // For testing: dump what was crawled.
            System.out.println(newsTemp.getTitle() + " " + newsTemp.getHtmlAddr());
            List<String> images = newsTemp.getImageAddrList();
            if (images != null && !images.isEmpty())
            {
                System.out.println(images.get(0));
            }
            System.out.println(newsTemp.getContent());
        }
    }

    /**
     * Fetches the article behind a headline link and copies its body markup
     * and image addresses into {@code newsTemp}.
     *
     * @param contentHtmlAddr address of the article page
     * @param newsTemp        news object to fill; left untouched when the page
     *                        cannot be fetched
     */
    public void PaserHtmlForNewsContent(String contentHtmlAddr, NewsInfo newsTemp)
    {
        String html = GetHtml(contentHtmlAddr);
        // Placeholder from the original listing; matches nothing on a real page.
        // hupu voice photos: "div.hp-wrap div.voice-main div.voice-item ul li div.voice-read-detailed"
        //                    + " div.voice-photoVideo div.voice-photo div.small-img img"
        String cssQueryphoto = "asdfas";
        // Other candidates under the same wrapper: "... div.detailWrap div.detailTitle",
        // "... div.detailWrap div.detailIntr"; hupu: "div.content div.row div.column div#articlewrap.area"
        String cssQueryContent = "div#pageMain div.pageMainLeft div.detailWrap div.detail";

        // GetHtml returns null on any failure, so check that before isEmpty().
        if (html == null || html.isEmpty())
        {
            return;
        }

        Document doc = Jsoup.parse(html);
        Elements contentElements = doc.select(cssQueryContent);
        Elements imgElements = doc.select(cssQueryphoto);

        // If several elements match, the last one wins (as in the original).
        for (Element ele : contentElements)
        {
            newsTemp.setContent(ele.html());
        }

        // Build the list once so that every image is kept, not only the last.
        List<String> tempImgList = new ArrayList<String>();
        for (Element ele : imgElements)
        {
            tempImgList.add(ele.attr("src"));
        }
        // Leave the list unset when no image matched; callers test for null.
        if (!tempImgList.isEmpty())
        {
            newsTemp.setImageAddrList(tempImgList);
        }
    }

    /**
     * Runs the crawler. An optional first argument overrides the configured
     * list page URL.
     */
    public static void main(String[] args)
    {
        CrawlHupu crawlHupu = new CrawlHupu();
        if (args != null && args.length > 0)
        {
            crawlHupu.ParseHtmlForNewsList(args[0]);
        }
        else
        {
            crawlHupu.ParseHtmlForNewsList();
        }
    }

}

/*
 * 2. This is the class that holds the crawled information; no further
 * explanation is given in the article. The published listing is cut off
 * part-way through the second constructor:
 *
 *   package zy.crawl.common;
 *
 *   import java.util.List;
 *
 *   public class NewsInfo
 *   {
 *       private String title;
 *       private String htmlAddr;
 *       private String content;
 *       private List<String> imageAddrList;
 *
 *       public NewsInfo(String title, String htmlAddr)
 *       {
 *           super();
 *           this.title = title;
 *           this.htmlAddr = htmlAddr;
 *       }
 *
 *       public NewsInfo(St        [listing truncated here]
 */
本文标题:JAVA实现爬取指定网站的数据
链接地址:https://www.777doc.com/doc-4239442.html